diff --git a/haystack_experimental/evaluation/__init__.py b/haystack_experimental/evaluation/__init__.py index 4546e497..b5119922 100644 --- a/haystack_experimental/evaluation/__init__.py +++ b/haystack_experimental/evaluation/__init__.py @@ -2,6 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -from .harness import EvalRunOverrides, EvaluationHarness +from .harness import EvaluationHarness, EvaluationRunOverrides -_all_ = ["EvaluationHarness", "EvalRunOverrides"] +_all_ = ["EvaluationHarness", "EvaluationRunOverrides"] diff --git a/haystack_experimental/evaluation/harness/__init__.py b/haystack_experimental/evaluation/harness/__init__.py index 90792912..6d761a05 100644 --- a/haystack_experimental/evaluation/harness/__init__.py +++ b/haystack_experimental/evaluation/harness/__init__.py @@ -2,6 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -from .evalution_harness import EvalRunOverrides, EvaluationHarness +from .evalution_harness import EvaluationHarness, EvaluationRunOverrides -_all_ = ["EvaluationHarness", "EvalRunOverrides"] +_all_ = ["EvaluationHarness", "EvaluationRunOverrides"] diff --git a/haystack_experimental/evaluation/harness/evalution_harness.py b/haystack_experimental/evaluation/harness/evalution_harness.py index e5977ee5..f1015139 100644 --- a/haystack_experimental/evaluation/harness/evalution_harness.py +++ b/haystack_experimental/evaluation/harness/evalution_harness.py @@ -8,11 +8,10 @@ from haystack import Pipeline from haystack.core.serialization import DeserializationCallbacks -from haystack.evaluation.eval_run_result import BaseEvaluationRunResult @dataclass -class EvalRunOverrides: +class EvaluationRunOverrides: """ Overrides for an evaluation run. 
@@ -32,7 +31,7 @@ class EvalRunOverrides: EvalRunInputT = TypeVar("EvalRunInputT") -EvalRunOutputT = TypeVar("EvalRunOutputT", bound=BaseEvaluationRunResult) +EvalRunOutputT = TypeVar("EvalRunOutputT") EvalRunOverridesT = TypeVar("EvalRunOverridesT") @@ -43,9 +42,7 @@ class EvaluationHarness(ABC, Generic[EvalRunInputT, EvalRunOverridesT, EvalRunOu @staticmethod def _override_pipeline(pipeline: Pipeline, parameter_overrides: Optional[Dict[str, Any]]) -> Pipeline: - def component_pre_init_callback( - name: str, cls: Type, init_params: Dict[str, Any] - ): # pylint: disable=unused-argument + def component_pre_init_callback(name: str, cls: Type, init_params: Dict[str, Any]): # pylint: disable=unused-argument assert parameter_overrides is not None overrides = parameter_overrides.get(name) if overrides: @@ -70,7 +67,11 @@ def validate_overrides(): @abstractmethod def run( - self, inputs: EvalRunInputT, *, overrides: Optional[EvalRunOverridesT] = None, run_name: Optional[str] = None + self, + inputs: EvalRunInputT, + *, + overrides: Optional[EvalRunOverridesT] = None, + run_name: Optional[str] = None, ) -> EvalRunOutputT: """ Launch a evaluation run. 
diff --git a/haystack_experimental/evaluation/harness/rag/__init__.py b/haystack_experimental/evaluation/harness/rag/__init__.py new file mode 100644 index 00000000..cc714697 --- /dev/null +++ b/haystack_experimental/evaluation/harness/rag/__init__.py @@ -0,0 +1,23 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from .harness import RAGEvaluationHarness +from .parameters import ( + RAGEvaluationInput, + RAGEvaluationMetric, + RAGEvaluationOutput, + RAGEvaluationOverrides, + RAGExpectedComponent, + RAGExpectedComponentMetadata, +) + +_all_ = [ + "RAGEvaluationHarness", + "RAGExpectedComponent", + "RAGExpectedComponentMetadata", + "RAGEvaluationMetric", + "RAGEvaluationOutput", + "RAGEvaluationOverrides", + "RAGEvaluationInput", +] diff --git a/haystack_experimental/evaluation/harness/rag/evaluation_pipeline.py b/haystack_experimental/evaluation/harness/rag/evaluation_pipeline.py new file mode 100644 index 00000000..1ba64bee --- /dev/null +++ b/haystack_experimental/evaluation/harness/rag/evaluation_pipeline.py @@ -0,0 +1,49 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from functools import partial +from typing import Set + +from haystack import Pipeline +from haystack.components.evaluators import ( + DocumentMAPEvaluator, + DocumentMRREvaluator, + DocumentRecallEvaluator, + FaithfulnessEvaluator, + SASEvaluator, +) +from haystack.components.evaluators.document_recall import RecallMode + +from .parameters import RAGEvaluationMetric + + +def default_rag_evaluation_pipeline( + metrics: Set[RAGEvaluationMetric], +) -> Pipeline: + """ + Builds the default evaluation pipeline for RAG. + + :param metrics: + The set of metrics to include in the pipeline. + :returns: + The evaluation pipeline. 
+ """ + pipeline = Pipeline() + + metric_ctors = { + RAGEvaluationMetric.DOCUMENT_MAP: DocumentMAPEvaluator, + RAGEvaluationMetric.DOCUMENT_MRR: DocumentMRREvaluator, + RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT: partial(DocumentRecallEvaluator, mode=RecallMode.SINGLE_HIT), + RAGEvaluationMetric.DOCUMENT_RECALL_MULTI_HIT: partial(DocumentRecallEvaluator, mode=RecallMode.MULTI_HIT), + RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY: partial( + SASEvaluator, model="sentence-transformers/all-MiniLM-L6-v2" + ), + RAGEvaluationMetric.ANSWER_FAITHFULNESS: FaithfulnessEvaluator, + } + + for metric in metrics: + ctor = metric_ctors[metric] + pipeline.add_component(metric.value, ctor()) + + return pipeline diff --git a/haystack_experimental/evaluation/harness/rag/harness.py b/haystack_experimental/evaluation/harness/rag/harness.py new file mode 100644 index 00000000..f0b51165 --- /dev/null +++ b/haystack_experimental/evaluation/harness/rag/harness.py @@ -0,0 +1,355 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from copy import deepcopy +from typing import Any, Dict, List, Optional, Set + +from haystack import Pipeline +from haystack.evaluation.eval_run_result import EvaluationRunResult + +from ...util.helpers import ( + aggregate_batched_pipeline_outputs, + deaggregate_batched_pipeline_inputs, +) +from ...util.pipeline_pair import PipelinePair +from ..evalution_harness import EvaluationHarness +from .evaluation_pipeline import default_rag_evaluation_pipeline +from .parameters import ( + RAGEvaluationInput, + RAGEvaluationMetric, + RAGEvaluationOutput, + RAGEvaluationOverrides, + RAGExpectedComponent, + RAGExpectedComponentMetadata, +) + + +class RAGEvaluationHarness(EvaluationHarness[RAGEvaluationInput, RAGEvaluationOverrides, RAGEvaluationOutput]): + """ + Evaluation harness for evaluating RAG pipelines. 
+ """ + + def __init__( + self, + rag_pipeline: Pipeline, + rag_components: Dict[RAGExpectedComponent, RAGExpectedComponentMetadata], + metrics: Set[RAGEvaluationMetric], + ): + """ + Create an evaluation harness for evaluating basic RAG pipelines. + + :param rag_pipeline: + The RAG pipeline to evaluate. + :param rag_components: + A mapping of expected components to their metadata. + :param metrics: + The metrics to use during evaluation. + """ + super().__init__() + + self._validate_rag_components(rag_pipeline, rag_components) + + self.rag_pipeline = rag_pipeline + self.rag_components = rag_components + self.metrics = metrics + self.evaluation_pipeline = default_rag_evaluation_pipeline(metrics) + + @classmethod + def default_with_embedding_retriever( + cls, rag_pipeline: Pipeline, metrics: Set[RAGEvaluationMetric] + ) -> "RAGEvaluationHarness": + """ + Create a default evaluation harness for evaluating RAG pipelines with a query embedder. + + :param rag_pipeline: + The RAG pipeline to evaluate. The following assumptions are made: + - The query embedder component is named 'query_embedder' and has a 'text' input. + - The document retriever component is named 'retriever' and has a 'documents' output. + - The response generator component is named 'generator' and has a 'replies' output. + :param metrics: + The metrics to use during evaluation. 
+ """ + rag_components = { + RAGExpectedComponent.QUERY_PROCESSOR: RAGExpectedComponentMetadata( + name="query_embedder", input_mapping={"query": "text"} + ), + RAGExpectedComponent.DOCUMENT_RETRIEVER: RAGExpectedComponentMetadata( + name="retriever", output_mapping={"retrieved_documents": "documents"} + ), + RAGExpectedComponent.RESPONSE_GENERATOR: RAGExpectedComponentMetadata( + name="generator", output_mapping={"replies": "replies"} + ), + } + + return cls(rag_pipeline, rag_components, deepcopy(metrics)) + + @classmethod + def default_with_keyword_retriever( + cls, rag_pipeline: Pipeline, metrics: Set[RAGEvaluationMetric] + ) -> "RAGEvaluationHarness": + """ + Create a default evaluation harness for evaluating RAG pipelines with a keyword retriever. + + :param rag_pipeline: + The RAG pipeline to evaluate. The following assumptions are made: + - The document retriever component is named 'retriever' and has a 'query' input and a 'documents' output. + - The response generator component is named 'generator' and has a 'replies' output. + :param metrics: + The metrics to use during evaluation. 
+ """ + rag_components = { + RAGExpectedComponent.QUERY_PROCESSOR: RAGExpectedComponentMetadata( + name="retriever", input_mapping={"query": "query"} + ), + RAGExpectedComponent.DOCUMENT_RETRIEVER: RAGExpectedComponentMetadata( + name="retriever", output_mapping={"retrieved_documents": "documents"} + ), + RAGExpectedComponent.RESPONSE_GENERATOR: RAGExpectedComponentMetadata( + name="generator", output_mapping={"replies": "replies"} + ), + } + + return cls(rag_pipeline, rag_components, deepcopy(metrics)) + + def run( # noqa: D102 + self, + inputs: RAGEvaluationInput, + *, + overrides: Optional[RAGEvaluationOverrides] = None, + run_name: Optional[str] = "RAG Evaluation", + ) -> RAGEvaluationOutput: + rag_inputs = self._prepare_rag_pipeline_inputs(inputs) + eval_inputs = self._prepare_eval_pipeline_additional_inputs(inputs) + pipeline_pair = self._generate_eval_run_pipelines(overrides) + + pipeline_outputs = pipeline_pair.run_first_as_batch(rag_inputs, eval_inputs) + rag_outputs, eval_outputs = ( + pipeline_outputs["first"], + pipeline_outputs["second"], + ) + + assert run_name is not None + run_results = EvaluationRunResult( + run_name, + inputs={ + "questions": inputs.queries, + "contexts": [ + [doc.content for doc in docs] + for docs in self._lookup_component_output( + RAGExpectedComponent.DOCUMENT_RETRIEVER, + rag_outputs, + "retrieved_documents", + ) + ], + "responses": self._lookup_component_output( + RAGExpectedComponent.RESPONSE_GENERATOR, rag_outputs, "replies" + ), + }, + results=eval_outputs, + ) + + return RAGEvaluationOutput( + evaluated_pipeline=pipeline_pair.first.dumps(), + evaluation_pipeline=pipeline_pair.second.dumps(), + inputs=deepcopy(inputs), + results=run_results, + ) + + def _lookup_component_output( + self, + component: RAGExpectedComponent, + outputs: Dict[str, Dict[str, Any]], + output_name: str, + ) -> Any: + name = self.rag_components[component].name + mapping = self.rag_components[component].output_mapping + output_name = 
mapping[output_name] + return outputs[name][output_name] + + def _generate_eval_run_pipelines(self, overrides: Optional[RAGEvaluationOverrides]) -> PipelinePair: + if overrides is None: + rag_overrides = None + eval_overrides = None + else: + rag_overrides = overrides.rag_pipeline + eval_overrides = overrides.eval_pipeline + + if eval_overrides is not None: + for metric in eval_overrides.keys(): + if metric not in self.metrics: + raise ValueError(f"Cannot override parameters of unused evaluation metric '{metric.value}'") + + eval_overrides = {k.value: v for k, v in eval_overrides.items()} # type: ignore + + rag_pipeline = self._override_pipeline(self.rag_pipeline, rag_overrides) + eval_pipeline = self._override_pipeline(self.evaluation_pipeline, eval_overrides) # type: ignore + + return PipelinePair( + first=rag_pipeline, + second=eval_pipeline, + outputs_to_inputs=self._map_rag_eval_pipeline_io(), + map_first_outputs=lambda x: self._aggregate_rag_outputs( # pylint: disable=unnecessary-lambda + x + ), + included_first_outputs={ + RAGExpectedComponent.DOCUMENT_RETRIEVER.value, + RAGExpectedComponent.RESPONSE_GENERATOR.value, + }, + ) + + def _aggregate_rag_outputs(self, outputs: List[Dict[str, Dict[str, Any]]]) -> Dict[str, Dict[str, Any]]: + aggregate = aggregate_batched_pipeline_outputs(outputs) + + # We only care about the first response from the generator. + generator_name = self.rag_components[RAGExpectedComponent.RESPONSE_GENERATOR].name + replies_output_name = self.rag_components[RAGExpectedComponent.RESPONSE_GENERATOR].output_mapping["replies"] + aggregate[generator_name][replies_output_name] = [r[0] for r in aggregate[generator_name][replies_output_name]] + + return aggregate + + def _map_rag_eval_pipeline_io(self) -> Dict[str, List[str]]: + # We currently only have metric components in the eval pipeline. + # So, we just map those inputs to the outputs of the rag pipeline. 
+ metric_inputs_to_component_outputs = { + RAGEvaluationMetric.DOCUMENT_MAP: { + "retrieved_documents": ( + RAGExpectedComponent.DOCUMENT_RETRIEVER, + "retrieved_documents", + ) + }, + RAGEvaluationMetric.DOCUMENT_MRR: { + "retrieved_documents": ( + RAGExpectedComponent.DOCUMENT_RETRIEVER, + "retrieved_documents", + ) + }, + RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT: { + "retrieved_documents": ( + RAGExpectedComponent.DOCUMENT_RETRIEVER, + "retrieved_documents", + ) + }, + RAGEvaluationMetric.DOCUMENT_RECALL_MULTI_HIT: { + "retrieved_documents": ( + RAGExpectedComponent.DOCUMENT_RETRIEVER, + "retrieved_documents", + ) + }, + RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY: { + "predicted_answers": ( + RAGExpectedComponent.RESPONSE_GENERATOR, + "replies", + ) + }, + RAGEvaluationMetric.ANSWER_FAITHFULNESS: { + "contexts": ( + RAGExpectedComponent.DOCUMENT_RETRIEVER, + "retrieved_documents", + ), + "responses": (RAGExpectedComponent.RESPONSE_GENERATOR, "replies"), + }, + } + + outputs_to_inputs: Dict[str, List[str]] = {} + for metric in self.metrics: + io = metric_inputs_to_component_outputs[metric] + for metric_input_name, (component, component_output_name) in io.items(): + component_out = ( + f"{self.rag_components[component].name}." + f"{self.rag_components[component].output_mapping[component_output_name]}" + ) + metric_in = f"{metric.value}.{metric_input_name}" + if component_out not in outputs_to_inputs: + outputs_to_inputs[component_out] = [] + outputs_to_inputs[component_out].append(metric_in) + + return outputs_to_inputs + + def _prepare_rag_pipeline_inputs(self, inputs: RAGEvaluationInput) -> List[Dict[str, Dict[str, Any]]]: + query_embedder_name = self.rag_components[RAGExpectedComponent.QUERY_PROCESSOR].name + query_embedder_text_input = self.rag_components[RAGExpectedComponent.QUERY_PROCESSOR].input_mapping["query"] + + if inputs.additional_rag_inputs is not None: + # Ensure that the query embedder input is not provided as additional input. 
+ existing = inputs.additional_rag_inputs.get(query_embedder_name) + if existing is not None: + existing = existing.get(query_embedder_text_input) # type: ignore + if existing is not None: + raise ValueError( + f"Query embedder input '{query_embedder_text_input}' cannot be provided as additional input." + ) + + # Add the queries as an aggregate input. + rag_inputs = deepcopy(inputs.additional_rag_inputs) + if query_embedder_name not in rag_inputs: + rag_inputs[query_embedder_name] = {} + rag_inputs[query_embedder_name][query_embedder_text_input] = deepcopy(inputs.queries) + else: + rag_inputs = {query_embedder_name: {query_embedder_text_input: deepcopy(inputs.queries)}} + + separate_rag_inputs = deaggregate_batched_pipeline_inputs(rag_inputs) + return separate_rag_inputs + + def _prepare_eval_pipeline_additional_inputs(self, inputs: RAGEvaluationInput) -> Dict[str, Dict[str, Any]]: + eval_inputs: Dict[str, Dict[str, List[Any]]] = {} + + for metric in self.metrics: + if metric in ( + RAGEvaluationMetric.DOCUMENT_MAP, + RAGEvaluationMetric.DOCUMENT_MRR, + RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT, + RAGEvaluationMetric.DOCUMENT_RECALL_MULTI_HIT, + ): + if inputs.ground_truth_documents is None: + raise ValueError(f"Ground truth documents required for metric '{metric.value}'.") + if len(inputs.ground_truth_documents) != len(inputs.queries): + raise ValueError("Length of ground truth documents should match the number of queries.") + + eval_inputs[metric.value] = {"ground_truth_documents": inputs.ground_truth_documents} + elif metric == RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY: + if inputs.ground_truth_answers is None: + raise ValueError(f"Ground truth answers required for metric '{metric.value}'.") + if len(inputs.ground_truth_answers) != len(inputs.queries): + raise ValueError("Length of ground truth answers should match the number of queries.") + + eval_inputs[metric.value] = {"ground_truth_answers": inputs.ground_truth_answers} + elif metric == 
RAGEvaluationMetric.ANSWER_FAITHFULNESS: + eval_inputs[metric.value] = {"questions": inputs.queries} + + return eval_inputs + + @staticmethod + def _validate_rag_components( + pipeline: Pipeline, + components: Dict[RAGExpectedComponent, RAGExpectedComponentMetadata], + ): + for e in RAGExpectedComponent: + if e not in components: + raise ValueError(f"RAG evaluation harness requires metadata for the '{e.value}' component.") + + pipeline_outputs = pipeline.outputs(include_components_with_connected_outputs=True) + pipeline_inputs = pipeline.inputs(include_components_with_connected_inputs=True) + + for component, metadata in components.items(): + if metadata.name not in pipeline_outputs or metadata.name not in pipeline_inputs: + raise ValueError( + f"Expected '{component.value}' component named '{metadata.name}' not found in pipeline." + ) + + comp_inputs = pipeline_inputs[metadata.name] + comp_outputs = pipeline_outputs[metadata.name] + + for needle in metadata.input_mapping.values(): + if needle not in comp_inputs: + raise ValueError( + f"Required input '{needle}' not found in '{component.value}' " + f"component named '{metadata.name}'." + ) + + for needle in metadata.output_mapping.values(): + if needle not in comp_outputs: + raise ValueError( + f"Required output '{needle}' not found in '{component.value}' " + f"component named '{metadata.name}'." 
+ ) diff --git a/haystack_experimental/evaluation/harness/rag/parameters.py b/haystack_experimental/evaluation/harness/rag/parameters.py new file mode 100644 index 00000000..4f5a4f7b --- /dev/null +++ b/haystack_experimental/evaluation/harness/rag/parameters.py @@ -0,0 +1,150 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from dataclasses import dataclass, field +from enum import Enum +from typing import Any, Dict, List, Optional + +from haystack import Document +from haystack.evaluation.eval_run_result import EvaluationRunResult + + +class RAGExpectedComponent(Enum): + """ + Represents the basic components in a RAG pipeline that need to be present for evaluation. + + Each of these can be separate components in the pipeline or a single component that performs + multiple tasks. + """ + + #: The component in a RAG pipeline that accepts the user query. + #: Expected inputs: `query` - Name of input that contains the query string. + QUERY_PROCESSOR = "query_processor" + + #: The component in a RAG pipeline that retrieves documents based on the query. + #: Expected outputs: `retrieved_documents` - Name of output containing retrieved documents. + DOCUMENT_RETRIEVER = "document_retriever" + + #: The component in a RAG pipeline that generates responses based on the query and the retrieved documents. + #: Expected outputs: `replies` - Name of output containing the LLM responses. Only the first response is used. + RESPONSE_GENERATOR = "response_generator" + + +@dataclass(frozen=True) +class RAGExpectedComponentMetadata: + """ + Metadata for a `RAGExpectedComponent`. + + :param name: + Name of the component in the pipeline. + :param input_mapping: + Mapping of the expected inputs to + corresponding component input names. + :param output_mapping: + Mapping of the expected outputs to + corresponding component output names. 
+ """ + + name: str + input_mapping: Dict[str, str] = field(default_factory=dict) + output_mapping: Dict[str, str] = field(default_factory=dict) + + +class RAGEvaluationMetric(Enum): + """ + Represents the metrics that can be used to evaluate a RAG pipeline. + """ + + #: Document Mean Average Precision. + DOCUMENT_MAP = "metric_doc_map" + + #: Document Mean Reciprocal Rank. + DOCUMENT_MRR = "metric_doc_mrr" + + #: Document Recall with a single hit. + DOCUMENT_RECALL_SINGLE_HIT = "metric_doc_recall_single" + + #: Document Recall with multiple hits. + DOCUMENT_RECALL_MULTI_HIT = "metric_doc_recall_multi" + + #: Semantic Answer Similarity. + SEMANTIC_ANSWER_SIMILARITY = "metric_sas" + + #: Answer Faithfulness. + ANSWER_FAITHFULNESS = "metric_answer_faithfulness" + + +@dataclass(frozen=True) +class RAGEvaluationInput: + """ + Input passed to the RAG evaluation harness. + + :param queries: + The queries passed to the RAG pipeline. + :param ground_truth_documents: + The ground truth documents passed to the + evaluation pipeline. Only required for metrics + that require them. + + Corresponds to the queries. + :param ground_truth_answers: + The ground truth answers passed to the + evaluation pipeline. Only required for metrics + that require them. + + Corresponds to the queries. + :param additional_rag_inputs: + Additional inputs to pass to the RAG pipeline. Each + key is the name of the component and its value a dictionary + with the input name and a list of values, each corresponding + to a query. + """ + + queries: List[str] + ground_truth_documents: Optional[List[List[Document]]] = None + ground_truth_answers: Optional[List[str]] = None + additional_rag_inputs: Optional[Dict[str, Dict[str, List[Any]]]] = None + + +@dataclass(frozen=True) +class RAGEvaluationOverrides: + """ + Overrides for a RAG evaluation run. + + Used to override the init parameters of components in + either (or both) the evaluated and evaluation pipelines. 
+ + :param rag_pipeline: + Overrides for the RAG pipeline. Each + key is a component name and its value a dictionary + with init parameters to override. + :param eval_pipeline: + Overrides for the evaluation pipeline. Each + key is a RAG metric and its value a dictionary + with init parameters to override. + """ + + rag_pipeline: Optional[Dict[str, Dict[str, Any]]] = None + eval_pipeline: Optional[Dict[RAGEvaluationMetric, Dict[str, Any]]] = None + + +@dataclass(frozen=True) +class RAGEvaluationOutput: + """ + Represents the output of a RAG evaluation run. + + :param evaluated_pipeline: + Serialized version of the evaluated pipeline, including overrides. + :param evaluation_pipeline: + Serialized version of the evaluation pipeline, including overrides. + :param inputs: + Input passed to the evaluation harness. + :param results: + Results of the evaluation run. + """ + + evaluated_pipeline: str + evaluation_pipeline: str + inputs: RAGEvaluationInput + results: EvaluationRunResult diff --git a/pyproject.toml b/pyproject.toml index 01ee888b..85214d0e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,8 @@ dependencies = [ [tool.hatch.envs.test] extra-dependencies = [ - # List here dependencies behind lazy-import + # RAG evaluation harness (SAS evaluator) + "sentence-transformers>=2.2.0", ] [tool.hatch.envs.test.scripts] diff --git a/test/evaluation/harness/__init__.py b/test/evaluation/harness/__init__.py new file mode 100644 index 00000000..c1764a6e --- /dev/null +++ b/test/evaluation/harness/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/test/evaluation/harness/rag/__init__.py b/test/evaluation/harness/rag/__init__.py new file mode 100644 index 00000000..c1764a6e --- /dev/null +++ b/test/evaluation/harness/rag/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 diff --git 
a/test/evaluation/harness/rag/test_harness.py b/test/evaluation/harness/rag/test_harness.py new file mode 100644 index 00000000..a9f45f97 --- /dev/null +++ b/test/evaluation/harness/rag/test_harness.py @@ -0,0 +1,560 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 +from typing import Any, Dict, List, Optional +import pytest + +import random +from haystack_experimental.evaluation.harness.rag import ( + RAGEvaluationHarness, + RAGExpectedComponent, + RAGExpectedComponentMetadata, + RAGEvaluationMetric, + RAGEvaluationOverrides, + RAGEvaluationInput, +) +from haystack import Pipeline, component, Document, default_to_dict, default_from_dict +from haystack.document_stores.in_memory import InMemoryDocumentStore +from haystack.components.embedders import SentenceTransformersTextEmbedder +from haystack.components.builders import PromptBuilder +from haystack.components.retrievers.in_memory import ( + InMemoryEmbeddingRetriever, + InMemoryBM25Retriever, +) +from haystack.components.generators import OpenAIGenerator +from haystack.utils import Secret + + +@component +class NonConformantComponent: + def __init__(self, inputs, outputs) -> None: + component.set_input_types(self, **inputs) + component.set_output_types(self, **outputs) + + def run(self, **kwargs): + return {} + + +@component +class MockGenerator: + def __init__(self, arg: int) -> None: + self.arg = arg + + def to_dict(self): + return default_to_dict(self, arg=self.arg) + + @classmethod + def from_dict(cls, data): + return default_from_dict(cls, data) + + @component.output_types(replies=List[str]) + def run(self, prompt: str) -> Dict[str, Any]: + return {"replies": ["placeholder"]} + + +@component +class MockKeywordRetriever: + def __init__(self) -> None: + self.counter = 0 + + @component.output_types(documents=List[Document]) + def run(self, query: str) -> Dict[str, Any]: + samples = [ + [Document(content="France")], + [ + Document(content="9th century"), + 
Document(content="10th century"), + Document(content="9th"), + ], + [ + Document(content="classical"), + Document(content="rock music"), + Document(content="dubstep"), + ], + [ + Document(content="11th"), + Document(content="the 11th"), + Document(content="11th century"), + ], + [ + Document(content="Denmark"), + Document(content="Norway"), + Document(content="Iceland"), + ], + [ + Document(content="10th century"), + Document(content="the first half of the 10th century"), + Document(content="10th"), + Document(content="10th"), + ], + ] + + idx = self.counter % len(samples) + self.counter += 1 + + return {"documents": samples[idx]} + + +def build_rag_pipeline_with_query_embedder( + embedder_name: str = "text_embedder", + embedder_component: Optional[Any] = None, + generator_name: str = "llm", + generator_component: Optional[Any] = None, +): + document_store = InMemoryDocumentStore() + retriever = InMemoryEmbeddingRetriever(document_store) + + if embedder_component: + text_embedder = embedder_component + else: + text_embedder = SentenceTransformersTextEmbedder( + model="sentence-transformers/all-MiniLM-L6-v2" + ) + template = """ + Given the following information, answer the question. 
+ + Context: + {% for document in documents %} + {{ document.content }} + {% endfor %} + + Question: {{question}} + Answer: + """ + + prompt_builder = PromptBuilder(template=template) + + if generator_component: + generator = generator_component + else: + generator = OpenAIGenerator( + model="gpt-3.5-turbo", api_key=Secret.from_token("test_key") + ) + + pipeline = Pipeline() + pipeline.add_component(embedder_name, text_embedder) + pipeline.add_component("retriever", retriever) + pipeline.add_component("prompt_builder", prompt_builder) + pipeline.add_component(generator_name, generator) + pipeline.connect(f"{embedder_name}.embedding", "retriever.query_embedding") + pipeline.connect("retriever", "prompt_builder.documents") + pipeline.connect("prompt_builder", generator_name) + return pipeline + + +def build_rag_pipeline_with_keyword_retriever( + retriever_name: str = "retriever", + retriever_component: Optional[Any] = None, + retriever_output_name: str = "documents", + generator_name: str = "llm", + generator_component: Optional[Any] = None, +): + document_store = InMemoryDocumentStore() + if retriever_component: + retriever = retriever_component + else: + retriever = InMemoryBM25Retriever(document_store) + template = """ + Given the following information, answer the question. 
+ + Context: + {% for document in documents %} + {{ document.content }} + {% endfor %} + + Question: {{question}} + Answer: + """ + + prompt_builder = PromptBuilder(template=template) + if generator_component: + generator = generator_component + else: + generator = OpenAIGenerator( + model="gpt-3.5-turbo", api_key=Secret.from_token("test_key") + ) + + pipeline = Pipeline() + pipeline.add_component(retriever_name, retriever) + pipeline.add_component("prompt_builder", prompt_builder) + pipeline.add_component(generator_name, generator) + pipeline.connect( + f"{retriever_name}.{retriever_output_name}", "prompt_builder.documents" + ) + pipeline.connect("prompt_builder", generator_name) + return pipeline + + +@pytest.fixture +def rag_pipeline(): + return build_rag_pipeline_with_query_embedder("text_embedder") + + +@pytest.fixture +def rag_pipeline_with_query_embedder(): + return build_rag_pipeline_with_query_embedder( + embedder_name="query_embedder", generator_name="generator" + ) + + +@pytest.fixture +def rag_pipeline_with_keyword_retriever(): + return build_rag_pipeline_with_keyword_retriever(generator_name="generator") + + +def test_rag_eval_harness_init(rag_pipeline): + harness = RAGEvaluationHarness( + rag_pipeline, + rag_components={ + RAGExpectedComponent.QUERY_PROCESSOR: RAGExpectedComponentMetadata( + name="text_embedder", input_mapping={"query": "text"} + ), + RAGExpectedComponent.DOCUMENT_RETRIEVER: RAGExpectedComponentMetadata( + name="retriever", output_mapping={"retrieved_documents": "documents"} + ), + RAGExpectedComponent.RESPONSE_GENERATOR: RAGExpectedComponentMetadata( + name="llm", output_mapping={"replies": "replies"} + ), + }, + metrics={RAGEvaluationMetric.DOCUMENT_MAP}, + ) + + +def test_rag_eval_harness_init_invalid_expected_component( + rag_pipeline, +): + with pytest.raises(ValueError, match="RAG evaluation harness requires metadata"): + _ = RAGEvaluationHarness( + rag_pipeline, + rag_components={}, + metrics={RAGEvaluationMetric.DOCUMENT_MAP}, 
+ ) + + with pytest.raises(ValueError, match="RAG evaluation harness requires metadata"): + _ = RAGEvaluationHarness( + rag_pipeline, + rag_components={ + RAGExpectedComponent.QUERY_PROCESSOR: RAGExpectedComponentMetadata( + name="text_embedder", input_mapping={"query": "text"} + ), + }, + metrics={RAGEvaluationMetric.DOCUMENT_MAP}, + ) + + +def test_rag_eval_harness_init_invalid_missing_components( + rag_pipeline, +): + with pytest.raises(ValueError, match="named 'embedder' not found in pipeline"): + _ = RAGEvaluationHarness( + rag_pipeline, + rag_components={ + RAGExpectedComponent.QUERY_PROCESSOR: RAGExpectedComponentMetadata( + name="embedder", input_mapping={"query": "text"} + ), + RAGExpectedComponent.DOCUMENT_RETRIEVER: RAGExpectedComponentMetadata( + name="retriever", + output_mapping={"retrieved_documents": "documents"}, + ), + RAGExpectedComponent.RESPONSE_GENERATOR: RAGExpectedComponentMetadata( + name="llm", output_mapping={"replies": "replies"} + ), + }, + metrics={RAGEvaluationMetric.DOCUMENT_MAP}, + ) + + +def test_rag_eval_harness_init_invalid_missing_inputs(rag_pipeline): + with pytest.raises( + ValueError, + match="Required input 'rando_input' not found in 'query_processor' component named 'text_embedder'", + ): + _ = RAGEvaluationHarness( + rag_pipeline, + rag_components={ + RAGExpectedComponent.QUERY_PROCESSOR: RAGExpectedComponentMetadata( + name="text_embedder", input_mapping={"query": "rando_input"} + ), + RAGExpectedComponent.DOCUMENT_RETRIEVER: RAGExpectedComponentMetadata( + name="retriever", + output_mapping={"retrieved_documents": "documents"}, + ), + RAGExpectedComponent.RESPONSE_GENERATOR: RAGExpectedComponentMetadata( + name="llm", output_mapping={"replies": "replies"} + ), + }, + metrics={RAGEvaluationMetric.DOCUMENT_MAP}, + ) + + +def test_rag_eval_harness_init_invalid_missing_outputs( + rag_pipeline, +): + with pytest.raises( + ValueError, + match="Required output 'rando_output' not found in 'response_generator' component named 
'llm'", + ): + _ = RAGEvaluationHarness( + rag_pipeline, + rag_components={ + RAGExpectedComponent.QUERY_PROCESSOR: RAGExpectedComponentMetadata( + name="text_embedder", input_mapping={"query": "text"} + ), + RAGExpectedComponent.DOCUMENT_RETRIEVER: RAGExpectedComponentMetadata( + name="retriever", + output_mapping={"retrieved_documents": "documents"}, + ), + RAGExpectedComponent.RESPONSE_GENERATOR: RAGExpectedComponentMetadata( + name="llm", output_mapping={"replies": "rando_output"} + ), + }, + metrics={RAGEvaluationMetric.DOCUMENT_MAP}, + ) + + +def test_rag_eval_harness_init_defaults( + rag_pipeline_with_query_embedder, rag_pipeline_with_keyword_retriever +): + _ = RAGEvaluationHarness.default_with_embedding_retriever( + rag_pipeline_with_query_embedder, metrics={RAGEvaluationMetric.DOCUMENT_MAP} + ) + + _ = RAGEvaluationHarness.default_with_keyword_retriever( + rag_pipeline_with_keyword_retriever, metrics={RAGEvaluationMetric.DOCUMENT_MAP} + ) + + +def test_rag_eval_harness_init_defaults_invalid_missing_inputs(): + with pytest.raises( + ValueError, + match="Required input 'text' not found in 'query_processor' component named 'query_embedder'", + ): + _ = RAGEvaluationHarness.default_with_embedding_retriever( + build_rag_pipeline_with_query_embedder( + embedder_name="llm", generator_name="query_embedder" + ), + metrics={RAGEvaluationMetric.DOCUMENT_MAP}, + ) + + with pytest.raises( + ValueError, + match="Required input 'query' not found in 'query_processor' component named 'retriever'", + ): + _ = RAGEvaluationHarness.default_with_keyword_retriever( + build_rag_pipeline_with_keyword_retriever( + retriever_name="llm", generator_name="retriever" + ), + metrics={RAGEvaluationMetric.DOCUMENT_MAP}, + ) + + +def test_rag_eval_harness_init_defaults_invalid_missing_outputs(): + non_conformant_query_embedder_pipeline = build_rag_pipeline_with_query_embedder( + embedder_name="query_embedder", + generator_name="generator", + generator_component=NonConformantComponent( + 
{"prompt": str}, {"responses": List[str]} + ), + ) + non_conformant_keyword_retriever_pipeline = ( + build_rag_pipeline_with_keyword_retriever( + retriever_component=NonConformantComponent( + {"query": str}, {"docs": List[Document]} + ), + retriever_output_name="docs", + ) + ) + + with pytest.raises( + ValueError, + match="Required output 'replies' not found in 'response_generator' component named 'generator'", + ): + _ = RAGEvaluationHarness.default_with_embedding_retriever( + non_conformant_query_embedder_pipeline, + metrics={RAGEvaluationMetric.DOCUMENT_MAP}, + ) + + with pytest.raises( + ValueError, + match="Required output 'documents' not found in 'document_retriever' component named 'retriever'", + ): + _ = RAGEvaluationHarness.default_with_keyword_retriever( + non_conformant_keyword_retriever_pipeline, + metrics={RAGEvaluationMetric.DOCUMENT_MAP}, + ) + + +def test_rag_eval_harness_run_invalid_ground_truths(rag_pipeline_with_query_embedder): + harness_map = RAGEvaluationHarness.default_with_embedding_retriever( + rag_pipeline_with_query_embedder, + metrics={ + RAGEvaluationMetric.DOCUMENT_MAP, + }, + ) + harness_sas = RAGEvaluationHarness.default_with_embedding_retriever( + rag_pipeline_with_query_embedder, + metrics={ + RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY, + }, + ) + + input_no_gt_docs = RAGEvaluationInput(queries=["What is the capital of France?"]) + input_mismatching_gt_docs = RAGEvaluationInput( + queries=["What is the capital of France?"], ground_truth_documents=[] + ) + input_no_gt_answers = RAGEvaluationInput( + queries=["What is the capital of France?"], + ground_truth_documents=[[Document(content="Paris is the capital of France.")]], + ) + input_mismatching_gt_answers = RAGEvaluationInput( + queries=["What is the capital of France?"], + ground_truth_documents=[[Document(content="Paris is the capital of France.")]], + ground_truth_answers=[], + ) + + with pytest.raises(ValueError, match="Ground truth documents required"): + _ = 
harness_map.run(input_no_gt_docs) + + with pytest.raises( + ValueError, + match="Length of ground truth documents should match the number of queries", + ): + _ = harness_map.run(input_mismatching_gt_docs) + + with pytest.raises(ValueError, match="Ground truth answers required"): + _ = harness_sas.run(input_no_gt_answers) + + with pytest.raises( + ValueError, + match="Length of ground truth answers should match the number of queries", + ): + _ = harness_sas.run(input_mismatching_gt_answers) + + +def test_rag_eval_harness_run_invalid_additional_input( + rag_pipeline_with_query_embedder, +): + harness = RAGEvaluationHarness.default_with_embedding_retriever( + rag_pipeline_with_query_embedder, + metrics={ + RAGEvaluationMetric.DOCUMENT_MAP, + }, + ) + + input = RAGEvaluationInput( + queries=["What is the capital of France?"], + ground_truth_documents=[[Document(content="Paris is the capital of France.")]], + additional_rag_inputs={"query_embedder": {"text": ["Some other question?"]}}, + ) + + with pytest.raises( + ValueError, + match="Query embedder input 'text' cannot be provided as additional input", + ): + _ = harness.run(input) + + +def test_rag_eval_harness_run_invalid_override( + rag_pipeline_with_query_embedder, +): + harness = RAGEvaluationHarness.default_with_embedding_retriever( + rag_pipeline_with_query_embedder, + metrics={ + RAGEvaluationMetric.DOCUMENT_MAP, + }, + ) + + input = RAGEvaluationInput( + queries=["What is the capital of France?"], + ground_truth_documents=[[Document(content="Paris is the capital of France.")]], + ) + + with pytest.raises( + ValueError, + match="Cannot override non-existent component 'rando_component'", + ): + _ = harness.run( + input, + overrides=RAGEvaluationOverrides( + rag_pipeline={"rando_component": {"Some": "thing"}} + ), + ) + + with pytest.raises( + ValueError, + match="Cannot override parameters of unused evaluation metric", + ): + _ = harness.run( + input, + overrides=RAGEvaluationOverrides( + eval_pipeline={ + 
RAGEvaluationMetric.DOCUMENT_RECALL_MULTI_HIT: { + "mode": "single_hit" + } + } + ), + ) + + +def test_rag_eval_harness_run_statistical_metrics(): + harness = RAGEvaluationHarness.default_with_keyword_retriever( + build_rag_pipeline_with_keyword_retriever( + retriever_component=MockKeywordRetriever(), + generator_component=MockGenerator(arg=0), + generator_name="generator", + ), + metrics={ + RAGEvaluationMetric.DOCUMENT_MAP, + RAGEvaluationMetric.DOCUMENT_MRR, + RAGEvaluationMetric.DOCUMENT_RECALL_SINGLE_HIT, + RAGEvaluationMetric.DOCUMENT_RECALL_MULTI_HIT, + }, + ) + + inputs = RAGEvaluationInput( + queries=["What is the capital of France?"] * 6, + ground_truth_documents=[ + [Document(content="France")], + [Document(content="9th century"), Document(content="9th")], + [Document(content="classical music"), Document(content="classical")], + [Document(content="11th century"), Document(content="the 11th")], + [Document(content="Denmark, Iceland and Norway")], + [Document(content="10th century"), Document(content="10th")], + ], + ) + + output = harness.run( + inputs, + overrides=RAGEvaluationOverrides( + rag_pipeline={ + "generator": {"arg": 100}, + } + ), + run_name="test_run", + ) + + assert output.inputs == inputs + assert output.results.run_name == "test_run" + assert output.results.results == { + "metric_doc_map": { + "score": 0.7222222222222222, + "individual_scores": [1.0, 0.8333333333333333, 1.0, 0.5, 0.0, 1.0], + }, + "metric_doc_recall_single": { + "score": 0.8333333333333334, + "individual_scores": [1.0, 1.0, 1.0, 1.0, 0.0, 1.0], + }, + "metric_doc_recall_multi": { + "score": 0.75, + "individual_scores": [1.0, 1.0, 0.5, 1.0, 0.0, 1.0], + }, + "metric_doc_mrr": { + "score": 0.75, + "individual_scores": [1.0, 1.0, 1.0, 0.5, 0.0, 1.0], + }, + } + overriden_pipeline_dict = Pipeline.loads(output.evaluated_pipeline).to_dict() + assert ( + overriden_pipeline_dict["components"]["generator"]["init_parameters"]["arg"] + == 100 + ) diff --git 
a/test/test_experimental.py b/test/test_experimental.py deleted file mode 100644 index f1748238..00000000 --- a/test/test_experimental.py +++ /dev/null @@ -1,2 +0,0 @@ -def test(): - pass