From 4c8175ffd3dd1ac80ed872c4fcc8c445f55b36fa Mon Sep 17 00:00:00 2001
From: Emmanuel Ferdman
Date: Sun, 15 Sep 2024 18:27:30 +0300
Subject: [PATCH] Update evaluate directory reference

Signed-off-by: Emmanuel Ferdman
---
 docs/README.md                                 |  6 +++---
 docs/user_guides/llm-support.md                |  2 +-
 nemoguardrails/evaluate/cli/evaluate.py        | 12 ++++++------
 nemoguardrails/evaluate/data/topical/README.md |  6 +++---
 nemoguardrails/evaluate/evaluate_moderation.py |  2 +-
 5 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/docs/README.md b/docs/README.md
index 768c25e29..b879865e5 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -55,9 +55,9 @@ NeMo Guardrails provides a set of CLI evaluation tools and experimental results
 There are also detailed guides on how to reproduce results and create datasets for the evaluation of each type of rail.
 
 * [Evaluation Tools and Results](https://github.com/NVIDIA/NeMo-Guardrails/tree/develop/nemoguardrails/eval): General explanation for the CLI evaluation tools and experimental results.
-* [Topical Rail Evaluation - Dataset Tools](https://github.com/NVIDIA/NeMo-Guardrails/blob/develop/nemoguardrails/eval/data/topical/README.md): Dataset tools and details to run experiments for topical rails.
-* [Fact-checking Rail Evaluation - Dataset Tools](https://github.com/NVIDIA/NeMo-Guardrails/blob/develop/nemoguardrails/eval/data/factchecking/README.md): Dataset tools and details to run experiments for fact-checking execution rail.
-* [Moderation Rail Evaluation - Dataset Tools](https://github.com/NVIDIA/NeMo-Guardrails/blob/develop/nemoguardrails/eval/data/moderation/README.md): Dataset tools and details to run experiments for moderation execution rail.
+* [Topical Rail Evaluation - Dataset Tools](https://github.com/NVIDIA/NeMo-Guardrails/blob/develop/nemoguardrails/evaluate/data/topical/README.md): Dataset tools and details to run experiments for topical rails.
+* [Fact-checking Rail Evaluation - Dataset Tools](https://github.com/NVIDIA/NeMo-Guardrails/blob/develop/nemoguardrails/evaluate/data/factchecking/README.md): Dataset tools and details to run experiments for fact-checking execution rail.
+* [Moderation Rail Evaluation - Dataset Tools](https://github.com/NVIDIA/NeMo-Guardrails/blob/develop/nemoguardrails/evaluate/data/moderation/README.md): Dataset tools and details to run experiments for moderation execution rail.
 
 ## Advanced Guides
 
diff --git a/docs/user_guides/llm-support.md b/docs/user_guides/llm-support.md
index 9c2a78cce..5f176661f 100644
--- a/docs/user_guides/llm-support.md
+++ b/docs/user_guides/llm-support.md
@@ -47,4 +47,4 @@ Table legend:
 The performance numbers reported in the table above for each LLM-feature pair are as follows:
 - the banking dataset evaluation for dialog (topical) rails
 - fact-checking using MSMARCO dataset and moderation rails experiments
-More details in the [evaluation docs](https://github.com/NVIDIA/NeMo-Guardrails/tree/develop/nemoguardrails/eval/README.md).
+More details in the [evaluation docs](https://github.com/NVIDIA/NeMo-Guardrails/tree/develop/nemoguardrails/evaluate/README.md).
diff --git a/nemoguardrails/evaluate/cli/evaluate.py b/nemoguardrails/evaluate/cli/evaluate.py
index 14947bb67..55bc12046 100644
--- a/nemoguardrails/evaluate/cli/evaluate.py
+++ b/nemoguardrails/evaluate/cli/evaluate.py
@@ -122,7 +122,7 @@ def moderation(
         help="The path to the guardrails config.", default="config"
     ),
     dataset_path: str = typer.Option(
-        "nemoguardrails/eval/data/moderation/harmful.txt",
+        "nemoguardrails/evaluate/data/moderation/harmful.txt",
         help="Path to dataset containing prompts",
     ),
     num_samples: int = typer.Option(50, help="Number of samples to evaluate"),
@@ -142,7 +142,7 @@ def moderation(
     Args:
         config (str): The path to the guardrails config. Defaults to "config".
         dataset_path (str): Path to the dataset containing prompts.
-            Defaults to "nemoguardrails/eval/data/moderation/harmful.txt".
+            Defaults to "nemoguardrails/evaluate/data/moderation/harmful.txt".
         num_samples (int): Number of samples to evaluate. Defaults to 50.
         check_input (bool): Evaluate the input self-check rail. Defaults to True.
         check_output (bool): Evaluate the output self-check rail. Defaults to True.
@@ -171,7 +171,7 @@ def hallucination(
         help="The path to the guardrails config.", default="config"
     ),
     dataset_path: str = typer.Option(
-        "nemoguardrails/eval/data/hallucination/sample.txt", help="Dataset path"
+        "nemoguardrails/evaluate/data/hallucination/sample.txt", help="Dataset path"
     ),
     num_samples: int = typer.Option(50, help="Number of samples to evaluate"),
     output_dir: str = typer.Option(
@@ -186,7 +186,7 @@ def hallucination(
 
     Args:
         config (str): The path to the guardrails config. Defaults to "config".
-        dataset_path (str): Dataset path. Defaults to "nemoguardrails/eval/data/hallucination/sample.txt".
+        dataset_path (str): Dataset path. Defaults to "nemoguardrails/evaluate/data/hallucination/sample.txt".
         num_samples (int): Number of samples to evaluate. Defaults to 50.
         output_dir (str): Output directory. Defaults to "eval_outputs/hallucination".
         write_outputs (bool): Write outputs to file. Defaults to True.
@@ -208,7 +208,7 @@ def fact_checking(
         help="The path to the guardrails config.", default="config"
     ),
     dataset_path: str = typer.Option(
-        "nemoguardrails/eval/data/factchecking/sample.json",
+        "nemoguardrails/evaluate/data/factchecking/sample.json",
         help="Path to the folder containing the dataset",
     ),
     num_samples: int = typer.Option(50, help="Number of samples to be evaluated"),
@@ -231,7 +231,7 @@ def fact_checking(
 
     Args:
         config (str): The path to the guardrails config. Defaults to "config".
-        dataset_path (str): Path to the folder containing the dataset. Defaults to "nemoguardrails/eval/data/factchecking/sample.json".
+        dataset_path (str): Path to the folder containing the dataset. Defaults to "nemoguardrails/evaluate/data/factchecking/sample.json".
         num_samples (int): Number of samples to be evaluated. Defaults to 50.
         create_negatives (bool): Create synthetic negative samples. Defaults to True.
         output_dir (str): Path to the folder where the outputs will be written. Defaults to "eval_outputs/factchecking".
diff --git a/nemoguardrails/evaluate/data/topical/README.md b/nemoguardrails/evaluate/data/topical/README.md
index 295158a06..fdb2b9e71 100644
--- a/nemoguardrails/evaluate/data/topical/README.md
+++ b/nemoguardrails/evaluate/data/topical/README.md
@@ -41,7 +41,7 @@ This will take into account the mapping file above. To achieve this follow the n
 
 1. Download the user intents file from the original dataset repository from [here](https://github.com/rahul051296/small-talk-rasa-stack/blob/master/data/nlu.md).
 2. Move it to the `nemoguardrails/eval/data/topical/chitchat/original_dataset` folder.
-3. Run the conversion script `nemoguardrails/eval/data/topical/create_colang_intent_file.py --dataset-name=chitchat --dataset-path=./chitchat/original_dataset/`
+3. Run the conversion script `nemoguardrails/evaluate/data/topical/create_colang_intent_file.py --dataset-name=chitchat --dataset-path=./chitchat/original_dataset/`
 4. The last step will create a `user.co` Colang file in the configured Guardrails app.
 
 To run the topical evaluation on this dataset run:
@@ -62,7 +62,7 @@ This will take into account the mapping file above. To achieve this follow the n
 
 1. Download the user intents files from the original dataset repository from [here](https://github.com/PolyAI-LDN/task-specific-datasets/tree/master/banking_data) (bot train and test).
 2. Move the two files to the `./nemoguardrails/eval/data/topical/banking/original_dataset` folder.
-3. Run the conversion script `./nemoguardrails/eval/data/topical/create_colang_intent_file.py --dataset-name=banking --dataset-path=./banking/original_dataset/`
+3. Run the conversion script `./nemoguardrails/evaluate/data/topical/create_colang_intent_file.py --dataset-name=banking --dataset-path=./banking/original_dataset/`
 4. The last step will create a `user.co` Colang file in the configured Guardrails app.
 
 To run the topical evaluation on this dataset run:
@@ -71,7 +71,7 @@ To run the topical evaluation on this dataset run:
 
 ## Experiment with a new NLU dataset
 
-If you want to assess the performance of topical rails with a new NLU dataset, you can use the `./nemoguardrails/eval/data/topical/dataset_tools.py` functionality.
+If you want to assess the performance of topical rails with a new NLU dataset, you can use the `./nemoguardrails/evaluate/data/topical/dataset_tools.py` functionality.
 For each dataset, you need to define a new class that extends the `DatasetConnector` class and implements the two following two functions:
 
 - `read_dataset`: Reads the dataset from the specified path, instantiating at least intent names, intent canonical forms, and intent samples. The path received as parameter should contain the original dataset files, in the specific format they were distributed.
diff --git a/nemoguardrails/evaluate/evaluate_moderation.py b/nemoguardrails/evaluate/evaluate_moderation.py
index 7b9d0fe05..477c5e352 100644
--- a/nemoguardrails/evaluate/evaluate_moderation.py
+++ b/nemoguardrails/evaluate/evaluate_moderation.py
@@ -35,7 +35,7 @@ class ModerationRailsEvaluation:
     def __init__(
         self,
         config: str,
-        dataset_path: str = "nemoguardrails/nemoguardrails/eval/data/moderation/harmful.txt",
+        dataset_path: str = "nemoguardrails/nemoguardrails/evaluate/data/moderation/harmful.txt",
         num_samples: int = 50,
         check_input: bool = True,
         check_output: bool = True,
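A minimal usage sketch of the relocated defaults follows; it is an illustration, not part of the patch. It assumes the `nemoguardrails` CLI exposes the evaluation commands as `nemoguardrails evaluate <rail>`, that the command is run from the repository root, and that `config` stands in for any valid guardrails config directory.

    # Uses the patched default dataset path implicitly.
    nemoguardrails evaluate moderation --config=config --num-samples=10

    # Equivalent invocation with the relocated path passed explicitly.
    nemoguardrails evaluate moderation --config=config \
        --dataset-path=nemoguardrails/evaluate/data/moderation/harmful.txt \
        --num-samples=10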