diff --git a/docs/examples.mdx b/docs/examples.mdx
index b3fc2e94..e4119e39 100644
--- a/docs/examples.mdx
+++ b/docs/examples.mdx
@@ -4,12 +4,31 @@ description: 'Examples for common scenarios'
 ---
 
+
+  Uses an entity extraction use-case to check for valid JSON outputs.
+
+
+  Uses an LLM to grade the output responses and ensure that they do not
+  contain "as an AI language model" in them.
+
   Tests a Retrieval-augmented Generation application built with LlamaIndex, scored on
-  metrics from RAGAS.
+  metrics from Ragas.
+
+
+  Runs Empirical on an OpenAI Assistant. Uses an LLM to grade the output responses and ensure that they do not contain "as an AI language model" in them.
-
-  Runs Empirical on an OpenAI Assistant.
-
-
-  Uses an entity extraction use-case to check for valid JSON outputs.
-
diff --git a/docs/mint.json b/docs/mint.json
index 8c64231f..8fbe5b8e 100644
--- a/docs/mint.json
+++ b/docs/mint.json
@@ -44,6 +44,7 @@
         "examples",
         "configuration",
         "running-in-ci",
+        "reporter",
         "telemetry"
       ]
     },
@@ -51,8 +52,16 @@
       "group": "Model providers",
       "pages": [
         "models/basics",
+        {
+          "group": "Hosted models",
+          "pages": [
+            "models/model",
+            "models/providers"
+          ]
+        },
+        "models/assistants",
         "models/custom",
-        "models/assistants"
+        "models/output"
       ]
     },
     {
@@ -68,12 +77,6 @@
         "scoring/llm",
         "scoring/python"
       ]
-    },
-    {
-      "group": "Reporter",
-      "pages": [
-        "reporter/basics"
-      ]
     }
   ],
   "footerSocials": {
diff --git a/docs/models/assistants.mdx b/docs/models/assistants.mdx
index e79d6779..73387915 100644
--- a/docs/models/assistants.mdx
+++ b/docs/models/assistants.mdx
@@ -39,7 +39,7 @@ below, we refer to the `user_query` input from the test [dataset](../dataset/bas
   JSON object of parameters to customize the Assistant (see more below)
 
-  A a custom name or label for this run
+  A custom name or label for this run
 
 ## Example
diff --git a/docs/models/basics.mdx b/docs/models/basics.mdx
index a66082ed..d9245f31 100644
--- a/docs/models/basics.mdx
+++ b/docs/models/basics.mdx
@@ -7,231 +7,32 @@
 Empirical can test how different models and model configurations work for your
 application. You can define which models and configurations to test in the
 [configuration file](../configuration).
 
-Empirical supports three types of model providers:
+Empirical supports a few types of model providers:
 
-- `model`: API calls to LLMs that are hosted by inference platforms, like OpenAI's GPT4
-- `py-script`: Custom models or applications defined as a Python module. See [the Python guide](./custom) to configure this.
-- `assistant`: API calls to OpenAI Assistants. See [the Assistants guide](./assistants) to configure this.
-
-The rest of this guide focuses on the `model` type.
-
-## Run configuration for LLMs
-
-To test an LLM, specify the following properties in the configuration:
-
-- `provider`: Name of the inference provider (e.g. `openai`, or other [supported providers](#supported-providers))
-- `model`: Name of the model (e.g. `gpt-3.5-turbo` or `claude-3-haiku`)
-- `prompt`: [Prompt](#prompt) sent to the model, with optional [placeholders](#placeholders)
-- `name` [optional]: A name or label for this run (auto-generated if not specified)
-
-You can configure as many model providers as you like. These models will be shown in a
-side-by-side comparison view in the web reporter.
- -```json empiricalrc.json -"runs": [ - { - "type": "model", - "provider": "openai", - "model": "gpt-3.5-turbo", - "prompt": "Hey I'm {{user_name}}" - } -] -``` - -### Prompt -The prompt serves as the initial input provided to the model to generate a response. -This property accepts either a string or a JSON chat format. - -The JSON chat format allows for a sequence of messages comprising the conversation so far. -Each message object has two required fields: -- `role`: Role of the messenger (either `system`, `user` or `assistant`) -- `content`: The content of the message - -```json empiricalrc.json -{ - "runs": [ - { - "prompt": [{ - "role": "system", - "content": "You are an SQLite expert who can convert natural language questions to SQL queries...." - }, { - "role": "user", - "content": "How many singers do we have?" - }] - } - ] -} -``` -The [Text-to-SQL example](https://github.com/empirical-run/empirical/tree/main/examples/spider) -uses this prompt format to test conversion of natural language questions to SQL queries. - -String based prompts are wrapped in `user` role message before sending to the model. -```json empiricalrc.json -{ - "runs": [ - { - "prompt": "Extract the name, age and location from the message, and respond with a JSON object ..." - } - ] -} -``` -The [basic example](https://github.com/empirical-run/empirical/tree/main/examples/basic) uses this prompt -format to test extraction of named entities from natural language text. - - -### Placeholders - -Define placeholders in the prompt with Handlebars syntax (like `{{user_name}}`) to inject values -from the dataset sample. These placeholders will be replaced with the corresponding input value -during execution. - -See [dataset](../dataset/basics) to learn more about sample inputs. - -## Supported providers - -| Provider | Description | -|----------|-------------| -| `openai` | All chat models are supported. Requires `OPENAI_API_KEY` environment variable. | -| `azure-openai` | All chat models from OpenAI that are hosted on Azure are supported. Requires `AZURE_OPENAI_API_KEY` and either of `AZURE_OPENAI_RESOURCE_NAME` or `AZURE_OPENAI_BASE_URL` environment variables. | -| `anthropic` | Claude 3 models are supported. Requires `ANTHROPIC_API_KEY` environment variable. | -| `mistral` | All chat models are supported. Requires `MISTRAL_API_KEY` environment variable. | -| `google` | Gemini Pro models are supported. Requires `GOOGLE_API_KEY` environment variable. | -| `fireworks` | Models hosted on Fireworks (e.g. `dbrx-instruct`) are supported. Requires `FIREWORKS_API_KEY` environment variable. | - - - - -#### Get API key - -- `AZURE_OPENAI_API_KEY`: This is the API key to authenticate with Azure. See [their docs](https://learn.microsoft.com/en-us/javascript/api/overview/azure/openai-readme?view=azure-node-preview#using-an-api-key-from-azure) to get the API key. - -#### Specify base url -You can specify the base URL of the Azure OpenAI endpoint by setting **either** one of the following environment variables: -- `AZURE_OPENAI_RESOURCE_NAME`: This the resource name which is used to create the endpoint base URL with the format `https://$AZURE_OPENAI_RESOURCE_NAME.openai.azure.com` -- `AZURE_OPENAI_BASE_URL`: This is if you want to specify the entire base URL used to access the chat completions API with the format `$AZURE_OPENAI_BASE_URL/openai/deployments//chat/completions`. 
For example - `https://some-custom-url.com` - -#### Model configuration - -In the configuration file, -- Set the `provider` to `azure-openai` -- Set `model` to the name of your model deployment - -#### Additional parameters - -- By default, the `api-version` parameter is set to "2024-02-15-preview". If you need to override this, set the `apiVersion` parameter +## Hosted models +Popular models hosted by inference platforms (e.g. GPT-4o by OpenAI) can be +directly specified in the Empirical run configuration with type as `model`. ```json -"runs": [ - { - "type": "model", - "provider": "azure-openai", - "model": "gpt-35-deployment", - "prompt": "Hey I'm {{user_name}}", - "parameters": { - "apiVersion": "2024-02-15-preview" - } - } -] -``` - - - - -#### Get API key - -The [Google AI studio](https://aistudio.google.com/) is the easiest way to get API keys. Once you have the key, -set it as the `GOOGLE_API_KEY` environment variable. - -#### Supported models - -We support the Gemini model codes, as defined in the [official docs](https://ai.google.dev/models/gemini). - -- Gemini 1.5 Pro: set `model` to `gemini-1.5-pro-latest` -- Gemini 1 Pro: set `model` to `gemini-pro` or `gemini-1.0-pro` - - - - -### Environment variables - -API calls to model providers require API keys, which are stored as environment variables. The CLI can work with: - -- Existing environment variables (using `process.env`) -- Environment variables defined in `.env` or `.env.local` files, in the current working directory - - For .env files that are located elsewhere, you can pass the `--env-file` flag - -```sh -npx @empiricalrun/cli --env-file -``` - -### Model parameters - -To override parameters like `temperature` or `max_tokens`, you can pass `parameters` alongwith the provider -configuration. All OpenAI parameters (see their [API reference](https://platform.openai.com/docs/api-reference/chat/create)) -are supported, except for a few [limitations](#limitations). - -For non-OpenAI models, we coerce these parameters to the most appropriate target parameter (e.g. `stop` in OpenAI -becomes `stop_sequences` for Anthropic.) - -You can add other parameters or override this behavior with [passthrough](#passthrough). - -```json empiricalrc.json "runs": [ { "type": "model", "provider": "openai", - "model": "gpt-3.5-turbo", - "prompt": "Hey I'm {{user_name}}", - "parameters": { - "temperature": 0.1 - } - } -] -``` - -#### Passthrough - -If your models rely on other parameters, you can still specify them in the configuration. These -parameters will be passed as-is to the model. - -For example, Mistral models support a `safePrompt` parameter for [guardrailing](https://docs.mistral.ai/platform/guardrailing/). - -```json empiricalrc.json -"runs": [ - { - "type": "model", - "provider": "mistral", - "model": "mistral-tiny", - "prompt": "Hey I'm {{user_name}}", - "parameters": { - "temperature": 0.1, - "safePrompt": true - } + "model": "gpt-4o", + "prompt": "Hey I'm {{user_name}}" } ] ``` -#### Configuring request timeout - -You can set the timeout duration in milliseconds under model parameters in the `empiricalrc.json` file. This might be required for prompt completions that are expected to take more time, for example while running models like Claude Opus. If no specific value is assigned, the default timeout duration of 30 seconds will be applied. 
-
-```json empiricalrc.json
-"runs": [
-  {
-    "type": "model",
-    "provider": "anthropic",
-    "model": "claude-3-opus",
-    "prompt": "Hey I'm {{user_name}}",
-    "parameters": {
-      "timeout": 10000
-    }
-  }
-]
-```
+- See how to [configure these models](./model)
+- For OpenAI Assistants, see [the Assistant guide](./assistants)
 
-#### Limitations
+## Custom scripts
 
-- These parameters are not supported today: `logit_bias`, `user`, `stream`
+For mature applications, or for those that require pre or post-processing around
+the model API call, it is recommended to write a custom script provider. That way,
+you can reference/import parts of your application and share code between your
+app and tests.
 
-If this limitation is blocking your use of Empirical, please file a [feature request](https://github.com/empirical-run/empirical/issues/new).
+- See [the Python guide](./custom) to configure models or apps defined as a Python module, with type `py-script`
diff --git a/docs/models/custom.mdx b/docs/models/custom.mdx
index 31c46fe3..6e6963f8 100644
--- a/docs/models/custom.mdx
+++ b/docs/models/custom.mdx
@@ -1,55 +1,78 @@
 ---
 title: 'Custom model or app'
-description: 'Specify your application or model with a Python entrypoint'
+description: 'Specify your application or model with a Python script'
 ---
 
-Using a Python function as entrypoint, you can define a **custom model** to test
+Using a Python function as the entry-point, you can define a **custom model** to test
 with Empirical. This method can also be used to **test an application**, which does
 pre or post-processing around the LLM call or chains multiple LLM calls
 together.
 
 ## Run configuration
 
-In your config file, set `type` as `py-script` and specify the Python file
-path in the `path` field.
+A minimal configuration looks like:
 
 ```json
 "runs": [
   {
     "type": "py-script",
-    "path": "rag.py"
+    "path": "rag.py",
+    "parameters": {
+      "feature_flag": "enabled"
+    }
   }
 ]
 ```
 
-You can additional pass following properties in run configuration:
-- **name**: `string` - a custom name to your run
-- **parameters**: `object` - object to pass values to the script to modify its behavior
 
-The Python file is expected to have a method called `execute` with the following
-signature:
+
+  Set to "py-script"
+
+
+  Specify path to the Python file, which must have a function `def execute` (see [file structure](#file-structure))
+
+
+  JSON object of parameters passed to the `execute` method to customize script behavior
+
+
+  A custom name or label for this run (auto-generated if not specified)
+
 
-- **Arguments**
-  - inputs: dict of key-value pairs with [sample inputs](../dataset/basics)
-  - parameters: dict of key-value pairs with the run parameters
-- **Returns**: an output dict with
-  - value (string): The response from the model/application
-  - metadata (dict): Custom key-value pairs that are passed on to the scorer and
-    web reporter
+## File structure
 
+The Python file is expected to have a method called `execute` with the following
+signature:
 
 ```python rag.py
-def execute(inputs):
+def execute(inputs, parameters):
+    # call the model and other processing here
     # ...
+
+    # optionally, use parameters to change script behavior
+    feature_flag = parameters.get("feature_flag", False)
+
     return {
-        "value": output,
+        "value": output, # string
         "metadata": {
-            "key": value
+            "key": value # string
         }
     }
 ```
 
-In a RAG application, `metadata` can be used to capture the retrieved context.
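+
+As a rough sketch of how this fits together (illustrative, not the exact runner internals),
+Empirical calls `execute` once per dataset sample, passing inputs from the sample and
+parameters from the run configuration:
+
+```python
+# Hypothetical sample and parameters, for illustration only
+inputs = {"question": "How many singers do we have?"}
+parameters = {"feature_flag": "enabled"}
+
+result = execute(inputs, parameters)
+assert isinstance(result["value"], str)  # "value" is the required output field
+```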
+
+### Function arguments
+
+
+  A dict object of key-value pairs with [inputs](../dataset/basics) picked up from the dataset
+
+
+  A dict object of key-value pairs that can be used to modify the script's behavior. Parameters are
+  defined in the [run configuration](#run-configuration).
+
+
+### Function return type
+
+The function is expected to return the [output object](./output) with `value` (string) as the
+required field.
 
 ## Example
 
 The [RAG example](https://github.com/empirical-run/empirical/tree/main/examples/rag)
-uses this model provider to test a RAG application.
+uses this model provider to test a RAG application. The `metadata` field is used to capture the retrieved context.
diff --git a/docs/models/model.mdx b/docs/models/model.mdx
new file mode 100644
index 00000000..421b4da8
--- /dev/null
+++ b/docs/models/model.mdx
@@ -0,0 +1,214 @@
+---
+title: 'Configuration'
+description: 'Choose model providers to test with'
+---
+
+## Run configuration for LLMs
+
+To test an LLM, specify the following properties in the configuration:
+
+
+  Should be "model"
+
+
+  Name of the inference provider (e.g. `openai`, or other [supported providers](./providers))
+
+
+  Name of the model (e.g. `gpt-3.5-turbo` or `claude-3-haiku`)
+
+
+  [Prompt](#prompt) sent to the model, with optional [placeholders](#placeholders)
+
+
+  JSON object of [parameters](#model-parameters) to customize the model behavior
+
+
+  A custom name or label for this run (auto-generated if not specified)
+
+
+You can configure as many model providers as you like. These models will be shown in a
+side-by-side comparison view in the web reporter.
+
+```json empiricalrc.json
+"runs": [
+  {
+    "type": "model",
+    "provider": "openai",
+    "model": "gpt-3.5-turbo",
+    "prompt": "Hey I'm {{user_name}}"
+  },
+  {
+    "type": "model",
+    "provider": "fireworks",
+    "model": "llama-v3-8b-instruct",
+    "prompt": "Hey I'm {{user_name}}"
+  }
+]
+```
+
+### Prompt
+The prompt serves as the initial input provided to the model to generate a response.
+This property accepts either a string or a JSON chat format.
+
+
+
+```json Prompt as string
+{
+  "runs": [
+    {
+      "prompt": "You are an SQLite expert who can convert natural language questions to SQL queries. What is the SQL query for this question: {{question}}"
+    }
+  ]
+}
+```
+
+```json Prompt as JSON
+{
+  "runs": [
+    {
+      "prompt": [
+        {
+          "role": "system",
+          "content": "You are an SQLite expert who can convert natural language questions to SQL queries."
+        },
+        {
+          "role": "user",
+          "content": "{{question}}"
+        }
+      ]
+    }
+  ]
+}
+```
+
+
+
+#### Prompt format: string
+
+String prompts are wrapped in a `user` role message before sending to the model.
+
+The [basic example](https://github.com/empirical-run/empirical/tree/main/examples/basic) uses this prompt
+format to test extraction of named entities from natural language text.
+
+#### Prompt format: JSON
+
+The JSON chat format allows for a sequence of messages comprising the conversation so far.
+Each message object has two required fields:
+- `role`: Role of the messenger (either `system`, `user` or `assistant`)
+- `content`: The content of the message
+
+The [Text-to-SQL example](https://github.com/empirical-run/empirical/tree/main/examples/spider)
+uses this prompt format to test conversion of natural language questions to SQL queries.
+
+## Placeholders
+
+Define placeholders in the prompt with Handlebars syntax (like `{{user_name}}`) to inject values
+from the dataset sample. These placeholders will be replaced with the corresponding input value
+during execution.
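+
+As a minimal sketch (the dataset schema shown here is illustrative; see the dataset docs
+for the exact format), a sample input of `{"user_name": "Jane"}` would turn the prompt
+below into `Hey I'm Jane`:
+
+```json
+{
+  "runs": [
+    { "type": "model", "provider": "openai", "model": "gpt-3.5-turbo", "prompt": "Hey I'm {{user_name}}" }
+  ],
+  "dataset": { "samples": [{ "inputs": { "user_name": "Jane" } }] }
+}
+```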
+
+See [dataset](../dataset/basics) to learn more about sample inputs.
+
+
+## Model parameters
+
+To override parameters like `temperature` or `max_tokens`, you can pass `parameters` along with the provider
+configuration. All OpenAI parameters (see their [API reference](https://platform.openai.com/docs/api-reference/chat/create))
+are supported, except for a few limitations: `logit_bias`, `user`, and `stream` are not supported today.
+
+For non-OpenAI models, we coerce these parameters to the most appropriate target parameter (e.g. `stop` in OpenAI
+becomes `stop_sequences` for Anthropic).
+
+You can add other parameters or override this behavior with [passthrough](#passthrough).
+
+```json empiricalrc.json
+"runs": [
+  {
+    "type": "model",
+    "provider": "openai",
+    "model": "gpt-3.5-turbo",
+    "prompt": "Hey I'm {{user_name}}",
+    "parameters": {
+      "temperature": 0.1
+    }
+  }
+]
+```
+
+### Tool calling
+
+Hosted models support tool calling. You can use the `tools` parameter to specify
+functions that are provided to the model.
+
+See [output object](./output) to see how the model response object stores tool calls.
+
+```json empiricalrc.json
+"runs": [
+  {
+    "type": "model",
+    "provider": "openai",
+    "model": "gpt-4o",
+    "prompt": "Add these numbers {{numberOne}} and {{numberTwo}}",
+    "parameters": {
+      "tools": [
+        {
+          "type": "function",
+          "function": {
+            "name": "add_numbers",
+            "description": "Helper function to add numbers",
+            "parameters": {
+              "type": "object",
+              "properties": {
+                "number_a": {
+                  "type": "number",
+                  "description": "The first number"
+                },
+                "number_b": {
+                  "type": "number",
+                  "description": "The second number"
+                }
+              }
+            }
+          }
+        }
+      ]
+    }
+  }
+]
+```
+
+### Passthrough
+
+If your models rely on other parameters, you can still specify them in the configuration. These
+parameters will be passed as-is to the model.
+
+For example, Mistral models support a `safePrompt` parameter for [guardrailing](https://docs.mistral.ai/platform/guardrailing/).
+
+```json empiricalrc.json
+"runs": [
+  {
+    "type": "model",
+    "provider": "mistral",
+    "model": "mistral-tiny",
+    "prompt": "Hey I'm {{user_name}}",
+    "parameters": {
+      "temperature": 0.1,
+      "safePrompt": true
+    }
+  }
+]
+```
+
+### Request timeout
+
+You can set the timeout duration in milliseconds under model parameters in the `empiricalrc.json` file. This might be required for prompt completions that are expected to take more time, for example while running models like Claude Opus. If no specific value is assigned, the default timeout duration of 30 seconds will be applied.
+
+```json empiricalrc.json
+"runs": [
+  {
+    "type": "model",
+    "provider": "anthropic",
+    "model": "claude-3-opus",
+    "prompt": "Hey I'm {{user_name}}",
+    "parameters": {
+      "timeout": 10000
+    }
+  }
+]
+```
diff --git a/docs/models/output.mdx b/docs/models/output.mdx
new file mode 100644
index 00000000..c51580c6
--- /dev/null
+++ b/docs/models/output.mdx
@@ -0,0 +1,54 @@
+---
+title: 'Output object'
+description: 'Reference for the output object structure'
+---
+
+All model types return an output object which has the following
+properties. This object is made available to the scoring functions.
+
+## How to use this?
+
+- **Generation**
+  - Hosted models are pre-configured to respond with this object
+  - If you are using [Custom scripts](../models/custom), the `execute` method is
+    expected to return this object
+- **Scoring**: If you are writing a [custom scorer in Python](../scoring/python), you can use this
+  as a reference for the `output` object sent to the scorer.
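+
+For instance, a custom script might return (and a scorer might then receive) an output
+object like this illustrative sketch:
+
+```json
+{
+  "value": "Paris is the capital of France.",
+  "metadata": { "retrieved_context": "..." },
+  "tool_calls": []
+}
+```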
+
+## Output structure
+
+
+  The value from the model (for chat models, this is the message content from the
+  assistant's response).
+
+
+  An arbitrary object with string keys and string values that can be used to store additional metadata
+  (e.g. retrieved context from a RAG pipeline, or result of a pre-processing step). The
+  metadata is shown on the web reporter and can also be scored.
+
+
+  List of [tool call](#tool-call) objects in the response from the model.
+
+
+## Tool call
+
+This mirrors the OpenAI tool call object.
+
+
+  A unique identifier for the tool call.
+
+
+  Equal to "function", since only one type is supported.
+
+
+  The [function](#function) object.
+
+
+### Function
+
+
+  Name of the function
+
+
+  The arguments to call the function with, in stringified JSON format
+
diff --git a/docs/models/providers.mdx b/docs/models/providers.mdx
new file mode 100644
index 00000000..3951d3b9
--- /dev/null
+++ b/docs/models/providers.mdx
@@ -0,0 +1,85 @@
+---
+title: 'Supported providers'
+description: 'LLM providers supported out-of-the-box'
+---
+
+Empirical supports a set of popular LLM inference providers out-of-the-box. These
+can be specified as `provider` in the Empirical configuration file.
+
+## Supported providers
+
+| Provider | Description |
+|----------|-------------|
+| `openai` | All chat models are supported. Requires `OPENAI_API_KEY` environment variable. |
+| `azure-openai` | All chat models from OpenAI that are hosted on Azure are supported. Requires `AZURE_OPENAI_API_KEY` and either of `AZURE_OPENAI_RESOURCE_NAME` or `AZURE_OPENAI_BASE_URL` environment variables. |
+| `anthropic` | Claude 3 models are supported. Requires `ANTHROPIC_API_KEY` environment variable. |
+| `mistral` | All chat models are supported. Requires `MISTRAL_API_KEY` environment variable. |
+| `google` | Gemini Pro models are supported. Requires `GOOGLE_API_KEY` environment variable. |
+| `fireworks` | Models hosted on Fireworks (e.g. `dbrx-instruct`) are supported. Requires `FIREWORKS_API_KEY` environment variable. |
+
+
+
+#### Get API key
+
+- `AZURE_OPENAI_API_KEY`: This is the API key to authenticate with Azure. See [their docs](https://learn.microsoft.com/en-us/javascript/api/overview/azure/openai-readme?view=azure-node-preview#using-an-api-key-from-azure) to get the API key.
+
+#### Specify base url
+You can specify the base URL of the Azure OpenAI endpoint by setting **either** one of the following environment variables:
+- `AZURE_OPENAI_RESOURCE_NAME`: This is the resource name, which is used to create the endpoint base URL with the format `https://$AZURE_OPENAI_RESOURCE_NAME.openai.azure.com`
+- `AZURE_OPENAI_BASE_URL`: Use this if you want to specify the entire base URL used to access the chat completions API with the format `$AZURE_OPENAI_BASE_URL/openai/deployments//chat/completions`. For example - `https://some-custom-url.com`
+
+#### Model configuration
+
+In the configuration file,
+- Set the `provider` to `azure-openai`
+- Set `model` to the name of your model deployment
+
+#### Additional parameters
+
+- By default, the `api-version` parameter is set to "2024-02-15-preview". If you need to override this, set the `apiVersion` parameter
+
+
+```json
+"runs": [
+  {
+    "type": "model",
+    "provider": "azure-openai",
+    "model": "gpt-35-deployment",
+    "prompt": "Hey I'm {{user_name}}",
+    "parameters": {
+      "apiVersion": "2024-02-15-preview"
+    }
+  }
+]
+```
+
+
+
+#### Get API key
+
+The [Google AI studio](https://aistudio.google.com/) is the easiest way to get API keys.
Once you have the key, +set it as the `GOOGLE_API_KEY` environment variable. + +#### Supported models + +We support the Gemini model codes, as defined in the [official docs](https://ai.google.dev/models/gemini). + +- Gemini 1.5 Pro: set `model` to `gemini-1.5-pro-latest` +- Gemini 1 Pro: set `model` to `gemini-pro` or `gemini-1.0-pro` + + + + +## Environment variables + +API calls to model providers require API keys, which are stored as environment variables. The CLI can work with: + +- Existing environment variables (using `process.env`) +- Environment variables defined in `.env` or `.env.local` files, in the current working directory + - For .env files that are located elsewhere, you can pass the `--env-file` flag + +```sh +npx @empiricalrun/cli --env-file +``` \ No newline at end of file diff --git a/docs/package.json b/docs/package.json index 600d7ceb..b2a16b72 100644 --- a/docs/package.json +++ b/docs/package.json @@ -9,6 +9,6 @@ "author": "", "license": "ISC", "dependencies": { - "mintlify": "^4.0.152" + "mintlify": "^4.0.158" } } diff --git a/docs/pnpm-lock.yaml b/docs/pnpm-lock.yaml index 3f183d49..a116496e 100644 --- a/docs/pnpm-lock.yaml +++ b/docs/pnpm-lock.yaml @@ -6,8 +6,8 @@ settings: dependencies: mintlify: - specifier: ^4.0.152 - version: 4.0.152(openapi-types@12.1.3)(react-dom@18.2.0)(react@18.2.0) + specifier: ^4.0.158 + version: 4.0.158(openapi-types@12.1.3)(react-dom@18.2.0)(react@18.2.0) packages: @@ -281,17 +281,17 @@ packages: react: 18.2.0 dev: false - /@mintlify/cli@4.0.152(openapi-types@12.1.3)(react-dom@18.2.0)(react@18.2.0): - resolution: {integrity: sha512-DaOSgfrD3r3tBY+KIyUnYsP/FS3PFRvx+97jIvGkkbrvngltwm7ELyGgqM+iHOhlszXDRFIjMFtJKkmYECsmOw==} + /@mintlify/cli@4.0.158(openapi-types@12.1.3)(react-dom@18.2.0)(react@18.2.0): + resolution: {integrity: sha512-ydHRhm1JXbcq9iEopcBUD74QTg6+1i6l06xOx0CpkjhJHSL//Up3aTj66DfrDO9dDZrvLRB3T9guwzBcGGjU+A==} engines: {node: '>=18.0.0'} hasBin: true dependencies: '@apidevtools/swagger-parser': 10.1.0(openapi-types@12.1.3) - '@mintlify/link-rot': 3.0.162(react-dom@18.2.0)(react@18.2.0) - '@mintlify/models': 0.0.86 - '@mintlify/prebuild': 1.0.162(react-dom@18.2.0)(react@18.2.0) - '@mintlify/previewing': 4.0.149(react-dom@18.2.0)(react@18.2.0) - '@mintlify/validation': 0.1.146 + '@mintlify/link-rot': 3.0.166(react-dom@18.2.0)(react@18.2.0) + '@mintlify/models': 0.0.88 + '@mintlify/prebuild': 1.0.166(react-dom@18.2.0)(react@18.2.0) + '@mintlify/previewing': 4.0.155(react-dom@18.2.0)(react@18.2.0) + '@mintlify/validation': 0.1.150 chalk: 5.3.0 detect-port: 1.5.1 fs-extra: 11.2.0 @@ -310,13 +310,13 @@ packages: - utf-8-validate dev: false - /@mintlify/common@1.0.94(react-dom@18.2.0)(react@18.2.0): - resolution: {integrity: sha512-ngFbaFtzlLK3/1ihrkSpY1hM2uA4rGhuJ4d5i3LdUl1bZqzH5rL5URPaj/ylKhYM0qVFOwQzM6b877lT6Sa4Dw==} + /@mintlify/common@1.0.98(react-dom@18.2.0)(react@18.2.0): + resolution: {integrity: sha512-eUL2mKFxSzVnBWXFS4HEGchgdUhgiBj9zxTMQMcKSp11F75WD+QvGBFiwsatYZNtrg/id8LRDAyJUYZDDnUUGw==} dependencies: '@apidevtools/swagger-parser': 10.1.0(openapi-types@12.1.3) '@mintlify/mdx': 0.0.44(react-dom@18.2.0)(react@18.2.0) - '@mintlify/models': 0.0.86 - '@mintlify/validation': 0.1.146 + '@mintlify/models': 0.0.88 + '@mintlify/validation': 0.1.150 '@sindresorhus/slugify': 2.2.1 acorn: 8.11.3 acorn-jsx: 5.3.2(acorn@8.11.3) @@ -359,13 +359,13 @@ packages: - supports-color dev: false - /@mintlify/link-rot@3.0.162(react-dom@18.2.0)(react@18.2.0): - resolution: {integrity: 
sha512-cm4VjTAZdie1Y5g0bq9IfI9epEAaPcboQ0LjT42sVfYOhDkA+ymV3UU5kJoga+hMllsZOAAaUaaKzpbHyVC+rQ==} + /@mintlify/link-rot@3.0.166(react-dom@18.2.0)(react@18.2.0): + resolution: {integrity: sha512-38F0VMiifwepTsu0entSwfK8bpwXJxawY26qSgzq3sfTUiGp03izcq3nSQj2rhmcqx/NwEUkQX6aM3ZAkYY+Vw==} engines: {node: '>=18.0.0'} dependencies: '@apidevtools/swagger-parser': 10.1.0(openapi-types@12.1.3) - '@mintlify/common': 1.0.94(react-dom@18.2.0)(react@18.2.0) - '@mintlify/prebuild': 1.0.162(react-dom@18.2.0)(react@18.2.0) + '@mintlify/common': 1.0.98(react-dom@18.2.0)(react@18.2.0) + '@mintlify/prebuild': 1.0.166(react-dom@18.2.0)(react@18.2.0) chalk: 5.3.0 fs-extra: 11.2.0 gray-matter: 4.0.3 @@ -396,8 +396,8 @@ packages: - supports-color dev: false - /@mintlify/models@0.0.86: - resolution: {integrity: sha512-tC71RKOi5ywV6XA4w3Kgk4A7vtoxlwDamfJQwfd31mq+7OTSruvS6JY45DEh4yQLDj1jlUIo+X1OuDk/tE0rGQ==} + /@mintlify/models@0.0.88: + resolution: {integrity: sha512-QEQ64AzRPI1hjZvGC73J6jX1fyTHCTE8hclSgT3281dm2TqY1jT/LluH8kxqwZtH1OFZ5/caZfWKyVEQmNo0WA==} engines: {node: '>=18.0.0'} dependencies: axios: 1.6.8 @@ -406,12 +406,12 @@ packages: - debug dev: false - /@mintlify/prebuild@1.0.162(react-dom@18.2.0)(react@18.2.0): - resolution: {integrity: sha512-lAKJwMp8XP7OhdAU5A/xhu3IekvfvY454CB0uuXL+hepXQR549/1TvbQ6pGj3loacnLhE7iylTXKQ/Bsyc2eEA==} + /@mintlify/prebuild@1.0.166(react-dom@18.2.0)(react@18.2.0): + resolution: {integrity: sha512-OXU6dGYIEnPWUPY1y83FU/pR/T7sdSpDk4Lp1jtm76rfUY35pg6TmFOkFvq+0cTuSKO9FafL11fCwrvKmAVTXQ==} dependencies: '@apidevtools/swagger-parser': 10.1.0(openapi-types@12.1.3) - '@mintlify/common': 1.0.94(react-dom@18.2.0)(react@18.2.0) - '@mintlify/validation': 0.1.146 + '@mintlify/common': 1.0.98(react-dom@18.2.0)(react@18.2.0) + '@mintlify/validation': 0.1.150 favicons: 7.2.0 fs-extra: 11.2.0 gray-matter: 4.0.3 @@ -426,14 +426,14 @@ packages: - supports-color dev: false - /@mintlify/previewing@4.0.149(react-dom@18.2.0)(react@18.2.0): - resolution: {integrity: sha512-prAcU8HYOLgXjkmCatJto7RCiZLFAOxeUvhHvQtqMLlyeX/tjaI15+DyPDomlcGD4hVKqCYXt/FSgXBCGECppQ==} + /@mintlify/previewing@4.0.155(react-dom@18.2.0)(react@18.2.0): + resolution: {integrity: sha512-e4yrr34zHWKpSiUCu3Y40h6Nc2Lx0qvO2QMiRwqhIy2FKFL0Xk+DesMNDkn/Bb+xbph//POOhvyTpuvpNqlEVw==} engines: {node: '>=18.0.0'} dependencies: '@apidevtools/swagger-parser': 10.1.0(openapi-types@12.1.3) - '@mintlify/common': 1.0.94(react-dom@18.2.0)(react@18.2.0) - '@mintlify/prebuild': 1.0.162(react-dom@18.2.0)(react@18.2.0) - '@mintlify/validation': 0.1.146 + '@mintlify/common': 1.0.98(react-dom@18.2.0)(react@18.2.0) + '@mintlify/prebuild': 1.0.166(react-dom@18.2.0)(react@18.2.0) + '@mintlify/validation': 0.1.150 '@octokit/rest': 19.0.13 chalk: 5.3.0 chokidar: 3.6.0 @@ -460,10 +460,10 @@ packages: - utf-8-validate dev: false - /@mintlify/validation@0.1.146: - resolution: {integrity: sha512-MCM6/GzJh3aUbCrdLW1VmXDVbcuE3gvh6otM5bLoiChGMohx5mqBohFZa5fWo9UMvbI3nUkIuIPt+/h4dONKlg==} + /@mintlify/validation@0.1.150: + resolution: {integrity: sha512-NODG7aO/9t4sERXm+SKpA340Lc58zqFQ1f7lRrHlfEKin1Y2WYpmrnYIXKzQ9dWw2n/EUjdbW2mFdb5u16/Z3A==} dependencies: - '@mintlify/models': 0.0.86 + '@mintlify/models': 0.0.88 lcm: 0.0.3 lodash: 4.17.21 openapi-types: 12.1.3 @@ -2800,12 +2800,12 @@ packages: yallist: 4.0.0 dev: false - /mintlify@4.0.152(openapi-types@12.1.3)(react-dom@18.2.0)(react@18.2.0): - resolution: {integrity: sha512-W74WhBGoP8+dIbAwpHOsiZvSck1GtKpigDepiDsI/PnnKTJRpdloKV/uRNZA/GOtnS9QdQktsYrnMPeX0gQ2PQ==} + 
/mintlify@4.0.158(openapi-types@12.1.3)(react-dom@18.2.0)(react@18.2.0): + resolution: {integrity: sha512-n57IF4y/69xYuM7Je8vqvnEBwP0T+eojnPlcS0HO3YbaC5vAHCKOO1jSOXSbeftjf1nRqzzx4KczB/2TKaI8nQ==} engines: {node: '>=18.0.0'} hasBin: true dependencies: - '@mintlify/cli': 4.0.152(openapi-types@12.1.3)(react-dom@18.2.0)(react@18.2.0) + '@mintlify/cli': 4.0.158(openapi-types@12.1.3)(react-dom@18.2.0)(react@18.2.0) transitivePeerDependencies: - bufferutil - debug diff --git a/docs/reporter/basics.mdx b/docs/reporter.mdx similarity index 97% rename from docs/reporter/basics.mdx rename to docs/reporter.mdx index 3ba8db03..38fd8710 100644 --- a/docs/reporter/basics.mdx +++ b/docs/reporter.mdx @@ -1,6 +1,6 @@ --- -title: 'Basics' -description: 'How to view results of a test' +title: 'Reporter' +description: 'View test results in your terminal and a web browser' --- After executing the test, you can access the results through a reporter. diff --git a/examples/tool_calls/score.py b/examples/tool_calls/score.py index 2761a8a1..fa7a3526 100644 --- a/examples/tool_calls/score.py +++ b/examples/tool_calls/score.py @@ -2,6 +2,10 @@ def evaluate(output, inputs): + """ + Returns score=1 if the output has the expected tool calls by checking for their + expected name and parameters. Returns score=0 otherwise. + """ expected_output = inputs.get("expected_tool_call", "") if expected_output == "": return []