From dc87ed06d777fbaf5d79d4583bed979387deb763 Mon Sep 17 00:00:00 2001
From: Arjun Attam
Date: Mon, 8 Apr 2024 10:59:45 +0530
Subject: [PATCH 1/3] docs: for configuration file, minor fixes elsewhere

---
 README.md              |  2 +-
 docs/configuration.mdx | 56 ++++++++++++++++++++++++++++++++++++++++++
 docs/mint.json         |  1 +
 docs/models/basics.mdx |  6 ++---
 docs/running-in-ci.mdx |  4 +--
 packages/cli/README.md |  4 +--
 6 files changed, 65 insertions(+), 8 deletions(-)
 create mode 100644 docs/configuration.mdx

diff --git a/README.md b/README.md
index 440a2dbc..b4d7e24a 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# empirical.run
+# Empirical
 
 [![npm](https://img.shields.io/npm/v/@empiricalrun/cli)](https://npmjs.com/package/@empiricalrun/cli)
 [![Discord](https://img.shields.io/badge/discord-empirical.run-blue?logo=discord&logoColor=white&color=5d68e8)](https://discord.gg/NeR6jj8dw9)
diff --git a/docs/configuration.mdx b/docs/configuration.mdx
new file mode 100644
index 00000000..88abbc83
--- /dev/null
+++ b/docs/configuration.mdx
@@ -0,0 +1,56 @@
+---
+title: 'Configuration file'
+description: 'Use a JSON file to configure your tests'
+---
+
+Empirical uses a JSON configuration file, which is located at `empiricalrc.json`, to describe
+the test to run. This configuration is declarative: you define what you
+want to test, and Empirical implements the expected behavior internally.
+
+## Configuration reference
+
+The `empiricalrc.json` configuration file has two high-level properties:
+
+- `runs`: Use this to define [model providers](./models/basics) and [scoring functions](./scoring/basics)
+- `dataset`: Use this to define the [scenarios to test](./dataset/basics)
+
+The [example](#example) at the end of this page shows how these two properties fit together.
+
+## Code editor integration
+
+Your code editor can give you auto-completions and detect linting errors for this configuration
+file. This uses a [JSON Schema](https://json-schema.org/) definition, which is hosted by Empirical.
+
+There are two ways to configure the schema definition.
+
+### `$schema` property
+
+Use the `$schema` property in the configuration file to specify the JSON schema URL.
+
+```json empiricalrc.json
+{
+  "$schema": "https://assets.empirical.run/config/schema/v1.14.json",
+  "runs": [
+    // ...
+  ],
+  "dataset": {
+    // ...
+  }
+}
+```
+
+### Visual Studio Code
+
+Add the `json.schemas` property to your VS Code configuration (user or workspace). This maps
+the `empiricalrc.json` file to the JSON schema.
+
+```json settings.json
+{
+  "json.schemas": [
+    {
+      "fileMatch": [
+        "empiricalrc.json"
+      ],
+      "url": "https://assets.empirical.run/config/schema/v1.14.json"
+    }
+  ]
+}
+```
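+
+## Example
+
+Here is a minimal sketch of a complete configuration, adapted from the
+quick start example (one OpenAI model run, an `is-json` scorer, and a
+one-sample dataset):
+
+```json empiricalrc.json
+{
+  "$schema": "https://assets.empirical.run/config/schema/v1.14.json",
+  "runs": [
+    {
+      "type": "model",
+      "provider": "openai",
+      "model": "gpt-3.5-turbo",
+      "prompt": "Extract the name, age and location from the message, and respond with a JSON object. If an entity is missing, respond with null.\n\nMessage: {{user_message}}",
+      "scorers": [
+        {
+          "type": "is-json"
+        }
+      ]
+    }
+  ],
+  "dataset": {
+    "samples": [
+      {
+        "inputs": {
+          "user_message": "This is Alice. I am a nurse from Maryland. I was born in 1990."
+        }
+      }
+    ]
+  }
+}
+```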
"provider": "openai", "model": "gpt-3.5-turbo", "prompt": "Hey I'm {{user_name}}" - }, + } ] ``` @@ -82,7 +82,7 @@ You can add other parameters or override this behavior with [passthrough](#passt "parameters": { "temperature": 0.1 } - }, + } ] ``` @@ -104,7 +104,7 @@ For example, Mistral models support a `safePrompt` parameter for [guardrailing]( "temperature": 0.1, "safePrompt": true } - }, + } ] ``` diff --git a/docs/running-in-ci.mdx b/docs/running-in-ci.mdx index 1344ae6c..4078d8a6 100644 --- a/docs/running-in-ci.mdx +++ b/docs/running-in-ci.mdx @@ -1,6 +1,6 @@ --- -title: 'Running in CI' -description: 'Automate test execution and reporting in your CI pipeline' +title: 'Run in GitHub Actions' +description: 'Automate continuous testing and reporting with CI' --- The Empirical CLI is optimized to run in CI/CD environments. This enables your team to diff --git a/packages/cli/README.md b/packages/cli/README.md index 3e552c42..5a4d228b 100644 --- a/packages/cli/README.md +++ b/packages/cli/README.md @@ -1,7 +1,7 @@ -# empirical.run CLI +# Empirical CLI [![npm](https://img.shields.io/npm/v/@empiricalrun/cli)](https://npmjs.com/package/@empiricalrun/cli) -[![Discord](https://dcbadge.vercel.app/api/server/NeR6jj8dw9?style=flat&compact=true)](https://discord.gg/NeR6jj8dw9) +[![Discord](https://img.shields.io/badge/discord-empirical.run-blue?logo=discord&logoColor=white&color=5d68e8)](https://discord.gg/NeR6jj8dw9) Empirical is the fastest way to test different LLMs, prompts and other model configurations, across all the scenarios that matter for your application. From 5baae3c3b9b360356e9a4f03056ae79940ca0089 Mon Sep 17 00:00:00 2001 From: Arjun Attam Date: Mon, 8 Apr 2024 20:14:51 +0530 Subject: [PATCH 2/3] review --- docs/configuration.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/configuration.mdx b/docs/configuration.mdx index 88abbc83..98a26665 100644 --- a/docs/configuration.mdx +++ b/docs/configuration.mdx @@ -3,7 +3,7 @@ title: 'Configuration file' description: 'Use a JSON file to configure your tests' --- -Empirical uses a JSON configuration file, which is located at `empiricalrc.json`, to describe +Empirical uses a JSON configuration file, called `empiricalrc.json`, to describe the test to run. This configuration is declarative, in the sense that you define what you want to test, and Empirical will internally implement the expected behavior. From 97b4833854d32f4c6e06958492cfbed6e3599371 Mon Sep 17 00:00:00 2001 From: Arjun Attam Date: Mon, 8 Apr 2024 20:17:20 +0530 Subject: [PATCH 3/3] docs: improvements to quick start (#96) --- README.md | 14 ++- docs/models/basics.mdx | 24 ++-- docs/quickstart.mdx | 107 +++++++++++++++++- examples/basic/empiricalrc.json | 2 - packages/cli/README.md | 14 ++- .../cli/src/runs/config/defaults/index.ts | 2 - 6 files changed, 139 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index b4d7e24a..8f9a7d02 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,8 @@ With Empirical, you can: ## Usage +[See quick start on docs →](https://docs.empirical.run/quickstart) + Empirical bundles together a CLI and a web app. The CLI handles running tests and the web app visualizes results. @@ -28,16 +30,22 @@ Everything runs locally, with a JSON configuration file, `empiricalrc.json`. ### Start with a basic example -This example converts incoming unstructured user messages into structured JSON objects -using an LLM. 
+
 ## Supported providers
 
 | Provider | Description |
diff --git a/docs/quickstart.mdx b/docs/quickstart.mdx
index f682fd4b..3a6c9d79 100644
--- a/docs/quickstart.mdx
+++ b/docs/quickstart.mdx
@@ -8,30 +8,125 @@ the web app visualizes results.
 
 Everything runs locally, with a JSON configuration file, `empiricalrc.json`.
 
-Required: [Node.js](https://nodejs.org/en) 20+ needs to be installed on your system.
+Required: Node.js 20+ needs to be installed on your system.
 
 ## Start with a basic example
 
-This example converts incoming unstructured user messages into structured JSON objects
-using an LLM.
+In this example, we will ask an LLM to parse user messages to extract entities and
+give us a structured JSON output. For example, "I'm Alice from Maryland" will
+become `{"name": "Alice", "location": "Maryland"}`.
+
+Our test will succeed if the model outputs valid JSON.
+
+1. Use the CLI to create a sample configuration file in `empiricalrc.json`.
+
+   ```sh
+   npx @empiricalrun/cli init
+   ```
+
+   Read the file to see the configured models and dataset samples that we will test
+   for. The default configuration uses models from OpenAI.
+
+   ```sh
+   cat empiricalrc.json
+   ```
+
+2. Run the test samples against the models with the `run` command.
+
+   ```sh
+   npx @empiricalrun/cli run
+   ```
+
+   This step requires the `OPENAI_API_KEY` environment variable to authenticate with
+   OpenAI. This execution will cost $0.0026, based on the selected models.
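+
+   If the key is not already set in your environment, you can export it before
+   running the command (a typical shell setup; the key value shown is a placeholder):
+
+   ```sh
+   export OPENAI_API_KEY="sk-..."
+   ```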
For example, "I'm Alice from Maryland" will +become `"{name: 'Alice', location: 'Maryland'}"`. -1. Use the CLI to create a sample configuration file called `empiricalrc.json`. +Our test will succeed if the model outputs valid JSON. + + + + Use the CLI to create a sample configuration file in `empiricalrc.json`. ```sh npx @empiricalrun/cli init ``` -2. Run the test samples against the models with the `run` command. + Read the file to see the configured models and dataset samples that we will test + for. The default configuration uses models from OpenAI. + + ```sh + cat empiricalrc.json + ``` + + + + Run the test samples against the models with the `run` command. ```sh npx @empiricalrun/cli run ``` -3. Use the `ui` command to open the reporter web app in your web browser and see side-by-side results. + This step requires the `OPENAI_API_KEY` environment variable to authenticate with + OpenAI. This execution will cost $0.0026, based on the selected models. + + + + Use the `ui` command to open the reporter web app in your web browser and see + side-by-side results. ```sh npx @empiricalrun/cli ui ``` + + + + GPT-4 Turbo tends to fail our JSON syntax check, because it returns outputs + in markdown syntax (with backticks ` ```json `). We can fix this behavior by enabling + [JSON mode](https://platform.openai.com/docs/guides/text-generation/json-mode). + + ```json + { + "model": "gpt-4-turbo-preview", + // ... + // Existing properties + "parameters": { + "response_format": { + "type": "json_object" + } + } + } + ``` + + + ```json empiricalrc.json + { + "runs": [ + { + "type": "model", + "provider": "openai", + "model": "gpt-3.5-turbo", + "prompt": "Extract the name, age and location from the message, and respond with a JSON object. If an entity is missing, respond with null.\n\nMessage: {{user_message}}", + "scorers": [ + { + "type": "is-json" + } + ] + }, + { + "type": "model", + "provider": "openai", + "model": "gpt-4-turbo-preview", + "parameters": { + "response_format": { + "type": "json_object" + } + }, + "prompt": "Extract the name, age and location from the message, and respond with a JSON object. If an entity is missing, respond with null.\n\nMessage: {{user_message}}", + "scorers": [ + { + "type": "is-json" + } + ] + } + ], + "dataset": { + "samples": [ + { + "inputs": { + "user_message": "Hi my name is John Doe. I'm 26 years old and I work in real estate." + } + }, + { + "inputs": { + "user_message": "This is Alice. I am a nurse from Maryland. I was born in 1990." + } + } + ] + } + } + ``` + + + Re-running the test with `npx @empiricalrun/cli run` will give us better results + for GPT-4 Turbo. + + + ## Make it yours diff --git a/examples/basic/empiricalrc.json b/examples/basic/empiricalrc.json index 179890b4..b7f19ae5 100644 --- a/examples/basic/empiricalrc.json +++ b/examples/basic/empiricalrc.json @@ -3,7 +3,6 @@ "runs": [ { "type": "model", - "name": "gpt-3.5-turbo run", "provider": "openai", "model": "gpt-3.5-turbo", "prompt": "Extract the name, age and location from the message, and respond with a JSON object. If an entity is missing, respond with null.\n\nMessage: {{user_message}}", @@ -15,7 +14,6 @@ }, { "type": "model", - "name": "gpt-4-turbo-preview run", "provider": "openai", "model": "gpt-4-turbo-preview", "prompt": "Extract the name, age and location from the message, and respond with a JSON object. 
+
 ## Make it yours
 
diff --git a/examples/basic/empiricalrc.json b/examples/basic/empiricalrc.json
index 179890b4..b7f19ae5 100644
--- a/examples/basic/empiricalrc.json
+++ b/examples/basic/empiricalrc.json
@@ -3,7 +3,6 @@
   "runs": [
     {
       "type": "model",
-      "name": "gpt-3.5-turbo run",
       "provider": "openai",
       "model": "gpt-3.5-turbo",
       "prompt": "Extract the name, age and location from the message, and respond with a JSON object. If an entity is missing, respond with null.\n\nMessage: {{user_message}}",
@@ -15,7 +14,6 @@
     },
     {
       "type": "model",
-      "name": "gpt-4-turbo-preview run",
       "provider": "openai",
       "model": "gpt-4-turbo-preview",
       "prompt": "Extract the name, age and location from the message, and respond with a JSON object. If an entity is missing, respond with null.\n\nMessage: {{user_message}}",
diff --git a/packages/cli/README.md b/packages/cli/README.md
index 5a4d228b..16a47b7e 100644
--- a/packages/cli/README.md
+++ b/packages/cli/README.md
@@ -19,6 +19,8 @@ With Empirical, you can:
 
 ## Usage
 
+[See quick start on docs →](https://docs.empirical.run/quickstart)
+
 Empirical bundles together a CLI and a web app. The CLI handles running tests
 and the web app visualizes results.
@@ -28,16 +30,22 @@ Everything runs locally, with a JSON configuration file, `empiricalrc.json`.
 
 ### Start with a basic example
 
-This example converts incoming unstructured user messages into structured JSON objects
-using an LLM.
+In this example, we will ask an LLM to parse user messages to extract entities and
+give us a structured JSON output. For example, "I'm Alice from Maryland" will
+become `{"name": "Alice", "location": "Maryland"}`.
+
+Our test will succeed if the model outputs valid JSON.
 
 1. Use the CLI to create a sample configuration file called `empiricalrc.json`.
 
   ```sh
  npx @empiricalrun/cli init
+  cat empiricalrc.json
  ```
 
-2. Run the test samples against the models with the `run` command.
+2. Run the test samples against the models with the `run` command. This step requires
+   the `OPENAI_API_KEY` environment variable to authenticate with OpenAI. This
+   execution will cost $0.0026, based on the selected models.
 
  ```sh
  npx @empiricalrun/cli run
  ```
diff --git a/packages/cli/src/runs/config/defaults/index.ts b/packages/cli/src/runs/config/defaults/index.ts
index fb04edf7..372c666e 100644
--- a/packages/cli/src/runs/config/defaults/index.ts
+++ b/packages/cli/src/runs/config/defaults/index.ts
@@ -4,7 +4,6 @@ export const config: RunsConfig = {
   runs: [
     {
       type: "model",
-      name: "gpt-3.5-turbo run",
       provider: "openai",
       model: "gpt-3.5-turbo",
       prompt:
@@ -17,7 +16,6 @@ export const config: RunsConfig = {
     },
     {
       type: "model",
-      name: "gpt-4-turbo-preview run",
       provider: "openai",
       model: "gpt-4-turbo-preview",
       prompt: