Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: ability to add global scorers #178

Merged
merged 2 commits into from
Apr 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/yellow-trees-pull.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@empiricalrun/cli": minor
---

feat: ability to add global scorers
21 changes: 8 additions & 13 deletions docs/quickstart.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -80,12 +80,7 @@ Our test will succeed if the model outputs valid JSON.
"type": "model",
"provider": "openai",
"model": "gpt-3.5-turbo",
"prompt": "Extract the name, age and location from the message, and respond with a JSON object. If an entity is missing, respond with null.\n\nMessage: {{user_message}}",
"scorers": [
{
"type": "is-json"
}
]
"prompt": "Extract the name, age and location from the message, and respond with a JSON object. If an entity is missing, respond with null.\n\nMessage: {{user_message}}"
},
{
"type": "model",
Expand All @@ -96,12 +91,7 @@ Our test will succeed if the model outputs valid JSON.
"type": "json_object"
}
},
"prompt": "Extract the name, age and location from the message, and respond with a JSON object. If an entity is missing, respond with null.\n\nMessage: {{user_message}}",
"scorers": [
{
"type": "is-json"
}
]
"prompt": "Extract the name, age and location from the message, and respond with a JSON object. If an entity is missing, respond with null.\n\nMessage: {{user_message}}"
}
],
"dataset": {
Expand All @@ -117,7 +107,12 @@ Our test will succeed if the model outputs valid JSON.
}
}
]
}
},
"scorers": [
{
"type": "is-json"
}
]
}
```
</Accordion>
Expand Down
5 changes: 0 additions & 5 deletions docs/scoring/basics.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,6 @@ as you like.

```json empiricalrc.json
{
"type": "model",
"name": "gpt-3.5-turbo run",
"provider": "openai",
"model": "gpt-3.5-turbo",
"prompt": "Always respond with a JSON object.",
"scorers": [
{
"type": "is-json"
Expand Down
21 changes: 8 additions & 13 deletions examples/basic/empiricalrc.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,7 @@
"type": "model",
"provider": "openai",
"model": "gpt-3.5-turbo",
"prompt": "Extract the name, age and location from the message, and respond with a JSON object. If an entity is missing, respond with null.\n\nMessage: {{user_message}}",
"scorers": [
{
"type": "is-json"
}
]
"prompt": "Extract the name, age and location from the message, and respond with a JSON object. If an entity is missing, respond with null.\n\nMessage: {{user_message}}"
},
{
"type": "model",
Expand All @@ -21,12 +16,7 @@
"response_format": {
"type": "json_object"
}
},
"scorers": [
{
"type": "is-json"
}
]
}
}
],
"dataset": {
Expand All @@ -42,5 +32,10 @@
}
}
]
}
},
"scorers": [
{
"type": "is-json"
}
]
}
18 changes: 9 additions & 9 deletions examples/humaneval/empiricalrc.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,17 @@
"prompt": "Complete the following python function. Return only the completed function so that it can be directly run on a Python shell, including imports like from typing import List.\n```python\n{{prompt}}\n```",
"parameters": {
"temperature": 0.1
},
"scorers": [
{
"type": "py-script",
"path": "score.py",
"name": "unit-tests"
}
]
}
}
],
"dataset": {
"path": "HumanEval.jsonl"
}
},
"scorers": [
{
"type": "py-script",
"path": "score.py",
"name": "unit-tests"
}
]
}
24 changes: 9 additions & 15 deletions examples/rag/empiricalrc.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,29 +6,23 @@
"path": "rag.py",
"parameters": {
"model": "gpt-3.5-turbo"
},
"scorers": [
{
"type": "py-script",
"path": "score.py"
}
]
}
},
{
"type": "py-script",
"path": "rag.py",
"parameters": {
"model": "gpt-4-turbo-preview"
},
"scorers": [
{
"type": "py-script",
"path": "score.py"
}
]
}
}
],
"dataset": {
"path": ".empiricalrun/dataset.jsonl"
}
},
"scorers": [
{
"type": "py-script",
"path": "score.py"
}
]
}
38 changes: 10 additions & 28 deletions examples/spider/empiricalrc.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,6 @@
"role": "user",
"content": "Question: {{question}} \n\nAnswer the above question with only the SQL query."
}
],
"scorers": [
{
"type": "sql-syntax"
},
{
"type": "py-script",
"path": "execution_accuracy.py"
}
]
},
{
Expand All @@ -38,15 +29,6 @@
"role": "user",
"content": "Question: {{question}} \n\nAnswer the above question with only the SQL query."
}
],
"scorers": [
{
"type": "sql-syntax"
},
{
"type": "py-script",
"path": "execution_accuracy.py"
}
]
},
{
Expand All @@ -62,19 +44,19 @@
"role": "user",
"content": "Question: {{question}} \n\nAnswer the above question with only the SQL query."
}
],
"scorers": [
{
"type": "sql-syntax"
},
{
"type": "py-script",
"path": "execution_accuracy.py"
}
]
}
],
"dataset": {
"path": "https://docs.google.com/spreadsheets/d/1x_p0lX2pJEyGkFoe1A9nY3q87qOJUd547f2lz99ugiM/edit#gid=0"
}
},
"scorers": [
{
"type": "sql-syntax"
},
{
"type": "py-script",
"path": "execution_accuracy.py"
}
]
}
37 changes: 26 additions & 11 deletions packages/cli/src/bin/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,31 @@ const cacheDir = ".empiricalrun";
const outputFilePath = `${cwd}/${cacheDir}/${outputFileName}`;
const runtimeOptionsPath = `${cwd}/${cacheDir}/runtime.json`;

/**
 * Reads the CLI config file, parses it as JSON and resolves the scorers of
 * each run.
 *
 * Scorers declared at the top level of the config act as defaults: they are
 * assigned to every run that does not declare its own `scorers`. Runs that
 * already have `scorers` are left untouched.
 *
 * If the file cannot be read or does not contain valid JSON, an error is
 * logged and the process exits with code 1.
 *
 * @returns the runs (with scorers resolved) and the dataset config. The
 * top-level `scorers` are not part of the returned object.
 */
const readConfig = async (): Promise<RunsConfig> => {
  let data: string;
  try {
    data = (await fs.readFile(configFileFullPath)).toString();
    console.log(buildSuccessLog(`read ${configFileName} file successfully`));
  } catch (err) {
    console.log(buildErrorLog(`Failed to read ${configFileName} file`));
    console.log(yellow("Please ensure running init command first"));
    process.exit(1);
  }

  // A malformed config file is a user error, not a crash: report it the same
  // way as a missing file instead of letting the SyntaxError escape.
  let config: RunsConfig;
  try {
    config = JSON.parse(data) as RunsConfig;
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    console.log(
      buildErrorLog(`Failed to parse ${configFileName} file: ${reason}`),
    );
    process.exit(1);
  }
  const { runs, dataset, scorers } = config;

  runs.forEach((r) => {
    // if scorers are not set for a run, then override it with the global scorers
    if (!r.scorers && scorers) {
      r.scorers = scorers;
    }
  });

  return {
    runs,
    dataset,
  };
};

program
.name("Empirical.run CLI")
.description(
Expand Down Expand Up @@ -90,19 +115,9 @@ program
dotenv.config({ path: runTimeOptions.envFilePath });
console.log(yellow("Initiating run..."));

let data;
const startTime = performance.now();
try {
data = await fs.readFile(configFileFullPath);
} catch (err) {
console.log(buildErrorLog(`Failed to read ${configFileName} file`));
console.log(yellow("Please ensure running init command first"));
process.exit(1);
}
const { runs, dataset: datasetConfig } = await readConfig();

console.log(buildSuccessLog(`read ${configFileName} file successfully`));
const jsonStr = data.toString();
const { runs, dataset: datasetConfig } = JSON.parse(jsonStr) as RunsConfig;
// TODO: add check here for empty runs config. Add validator of the file
let dataset: Dataset;
const store = new EmpiricalStore();
Expand Down
5 changes: 3 additions & 2 deletions packages/cli/src/types/index.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import { RunConfig, DatasetConfig } from "@empiricalrun/types";
import { RunConfig, DatasetConfig, Scorer } from "@empiricalrun/types";

/**
 * Shape of the object the CLI obtains by parsing its JSON config file.
 *
 * NOTE(review): `$schema` is listed twice below; this looks like the removed
 * and re-added line of a diff view, so the merged file presumably declares it
 * once. Confirm against the actual file.
 */
export type RunsConfig = {
$schema?: string;
// Runs declared in the config file.
runs: RunConfig[];
// Dataset configuration declared in the config file.
dataset: DatasetConfig;
$schema?: string;
// Optional scorers declared at the top level of the config, alongside
// `runs` and `dataset`.
scorers?: Scorer[];
};