
Commit 7eeeec0

Merge branch 'main' into feat/fastembed-colbert-reranker
2 parents c36f86e + 5a67b27 commit 7eeeec0

File tree: 5 files changed (+43, -43 lines)

.github/workflows/deepeval.yml
integrations/deepeval/pyproject.toml
integrations/deepeval/src/haystack_integrations/components/evaluators/deepeval/evaluator.py
integrations/deepeval/src/haystack_integrations/components/evaluators/deepeval/metrics.py
integrations/deepeval/tests/test_evaluator.py


.github/workflows/deepeval.yml

Lines changed: 3 additions & 3 deletions
@@ -32,7 +32,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-latest, windows-latest, macos-latest]
-        python-version: ["3.9", "3.13"]
+        python-version: ["3.10", "3.13"]
 
     steps:
       - name: Support longpaths
@@ -50,11 +50,11 @@ jobs:
       - name: Install Hatch
         run: pip install --upgrade hatch
       - name: Lint
-        if: matrix.python-version == '3.9' && runner.os == 'Linux'
+        if: matrix.python-version == '3.10' && runner.os == 'Linux'
         run: hatch run fmt-check && hatch run test:types
 
       - name: Generate docs
-        if: matrix.python-version == '3.9' && runner.os == 'Linux'
+        if: matrix.python-version == '3.10' && runner.os == 'Linux'
         run: hatch run docs
 
       - name: Run tests

integrations/deepeval/pyproject.toml

Lines changed: 3 additions & 4 deletions
@@ -5,17 +5,16 @@ build-backend = "hatchling.build"
 [project]
 name = "deepeval-haystack"
 dynamic = ["version"]
-description = 'An integration of DeepEvla LLM evaluation framework with Haystack'
+description = 'An integration of DeepEval LLM evaluation framework with Haystack'
 readme = "README.md"
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 license = "Apache-2.0"
 keywords = []
 authors = [{ name = "deepset GmbH", email = "info@deepset.ai" }]
 classifiers = [
     "License :: OSI Approved :: Apache Software License",
     "Development Status :: 4 - Beta",
     "Programming Language :: Python",
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
@@ -77,7 +76,7 @@ module = ["deepeval.*"]
 ignore_missing_imports = true
 
 [tool.ruff]
-target-version = "py38"
+target-version = "py310"
 line-length = 120
 
 [tool.ruff.lint]
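The requires-python bump, the dropped 3.9 classifier, and the new ruff target-version all track the source changes in this commit: the code now uses PEP 604 unions (str | None) and built-in generics (dict[str, Any]), some of them in expressions evaluated at runtime, and those need Python 3.10 or newer. A minimal sketch of that constraint, purely illustrative and not part of the repository:

    import sys

    # PEP 604 unions used as runtime values (e.g. stored in a dict) need Python >= 3.10;
    # on 3.9 the `|` operator is not defined between a type and None.
    assert sys.version_info >= (3, 10), "str | None as a runtime expression needs 3.10+"

    init_parameters = {"model": str | None}  # creates a types.UnionType instance
    print(init_parameters["model"])          # prints: str | None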

integrations/deepeval/src/haystack_integrations/components/evaluators/deepeval/evaluator.py

Lines changed: 11 additions & 10 deletions
@@ -1,5 +1,6 @@
 import json
-from typing import Any, Callable, Dict, List, Optional, Union
+from collections.abc import Callable
+from typing import Any
 
 from haystack import DeserializationError, component, default_from_dict, default_to_dict
 
@@ -47,12 +48,12 @@ class DeepEvalEvaluator:
 
     _backend_metric: BaseMetric
     # Wrapped for easy mocking.
-    _backend_callable: Callable[[List[LLMTestCase], BaseMetric], EvaluationResult]
+    _backend_callable: Callable[[list[LLMTestCase], BaseMetric], EvaluationResult]
 
     def __init__(
         self,
-        metric: Union[str, DeepEvalMetric],
-        metric_params: Optional[Dict[str, Any]] = None,
+        metric: str | DeepEvalMetric,
+        metric_params: dict[str, Any] | None = None,
     ):
         """
         Construct a new DeepEval evaluator.
@@ -72,8 +73,8 @@ def __init__(
         expected_inputs = self.descriptor.input_parameters
         component.set_input_types(self, **expected_inputs)
 
-    @component.output_types(results=List[List[Dict[str, Any]]])
-    def run(self, **inputs: Any) -> Dict[str, Any]:
+    @component.output_types(results=list[list[dict[str, Any]]])
+    def run(self, **inputs: Any) -> dict[str, Any]:
         """
         Run the DeepEval evaluator on the provided inputs.
 
@@ -91,7 +92,7 @@ def run(self, **inputs: Any) -> Dict[str, Any]:
             - `explanation` - An optional explanation of the score.
         """
         InputConverters.validate_input_parameters(self.metric, self.descriptor.input_parameters, inputs)
-        converted_inputs: List[LLMTestCase] = list(self.descriptor.input_converter(**inputs))  # type: ignore
+        converted_inputs: list[LLMTestCase] = list(self.descriptor.input_converter(**inputs))  # type: ignore
 
         results = self._backend_callable(converted_inputs, self._backend_metric)
         converted_results = [
@@ -100,7 +101,7 @@ def run(self, **inputs: Any) -> Dict[str, Any]:
 
         return {"results": converted_results}
 
-    def to_dict(self) -> Dict[str, Any]:
+    def to_dict(self) -> dict[str, Any]:
         """
         Serializes the component to a dictionary.
 
@@ -128,7 +129,7 @@ def check_serializable(obj: Any) -> bool:
             )
 
     @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> "DeepEvalEvaluator":
+    def from_dict(cls, data: dict[str, Any]) -> "DeepEvalEvaluator":
         """
         Deserializes the component from a dictionary.
 
@@ -140,7 +141,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "DeepEvalEvaluator":
         return default_from_dict(cls, data)
 
     @staticmethod
-    def _invoke_deepeval(test_cases: List[LLMTestCase], metric: BaseMetric) -> EvaluationResult:
+    def _invoke_deepeval(test_cases: list[LLMTestCase], metric: BaseMetric) -> EvaluationResult:
         return evaluate(test_cases=test_cases, metrics=[metric])
 
     def _init_backend(self):
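The pattern in this file, repeated across the rest of the commit, is the move to Python 3.10 typing: Callable now comes from collections.abc, typing.List/Dict/Type become the built-in generics list/dict/type, and Optional[X]/Union[X, Y] become X | None / X | Y. A short self-contained sketch of the new style (the names below are hypothetical, not taken from the repository):

    from collections.abc import Callable
    from typing import Any

    # X | None and built-in generics replace Optional/Union and Dict/List;
    # Callable is imported from collections.abc instead of typing.
    def run(metric: str | int, params: dict[str, Any] | None = None) -> dict[str, Any]:
        return {"metric": metric, "params": params}

    handler: Callable[[list[str]], dict[str, Any]] = lambda items: {"count": len(items)}

    print(run("faithfulness"))           # {'metric': 'faithfulness', 'params': None}
    print(handler(["query", "answer"]))  # {'count': 2}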

integrations/deepeval/src/haystack_integrations/components/evaluators/deepeval/metrics.py

Lines changed: 23 additions & 22 deletions
@@ -1,9 +1,10 @@
 import dataclasses
 import inspect
+from collections.abc import Callable, Iterable, Mapping
 from dataclasses import dataclass
 from enum import Enum
 from functools import partial
-from typing import Any, Callable, Dict, Iterable, List, Mapping, Optional, Type
+from typing import Any
 
 from deepeval.evaluate.types import TestResult
 from deepeval.metrics import (
@@ -83,8 +84,8 @@ class MetricResult:
     """
 
     name: str
-    score: Optional[float] = None
-    explanation: Optional[str] = None
+    score: float | None = None
+    explanation: str | None = None
 
     def to_dict(self):
         return dataclasses.asdict(self)
@@ -112,21 +113,21 @@ class MetricDescriptor:
     """
 
     metric: DeepEvalMetric
-    backend: Type[BaseMetric]
-    input_parameters: Dict[str, Type]
+    backend: type[BaseMetric]
+    input_parameters: dict[str, type]
     input_converter: Callable[[Any], Iterable[LLMTestCase]]
-    output_converter: Callable[[TestResult], List[MetricResult]]
-    init_parameters: Optional[Mapping[str, Type]] = None
+    output_converter: Callable[[TestResult], list[MetricResult]]
+    init_parameters: Mapping[str, type] | None = None
 
     @classmethod
     def new(
         cls,
         metric: DeepEvalMetric,
-        backend: Type[BaseMetric],
+        backend: type[BaseMetric],
         input_converter: Callable[[Any], Iterable[LLMTestCase]],
-        output_converter: Optional[Callable[[TestResult], List[MetricResult]]] = None,
+        output_converter: Callable[[TestResult], list[MetricResult]] | None = None,
         *,
-        init_parameters: Optional[Mapping[str, Type]] = None,
+        init_parameters: Mapping[str, type] | None = None,
     ) -> "MetricDescriptor":
         input_converter_signature = inspect.signature(input_converter)
         input_parameters = {}
@@ -175,27 +176,27 @@ def _validate_input_elements(**kwargs):
             raise ValueError(msg)
 
     @staticmethod
-    def validate_input_parameters(metric: DeepEvalMetric, expected: Dict[str, Any], received: Dict[str, Any]) -> None:
+    def validate_input_parameters(metric: DeepEvalMetric, expected: dict[str, Any], received: dict[str, Any]) -> None:
         for param, _ in expected.items():
             if param not in received:
                 msg = f"DeepEval evaluator expected input parameter '{param}' for metric '{metric}'"
                 raise ValueError(msg)
 
     @staticmethod
     def question_context_response(
-        questions: List[str], contexts: List[List[str]], responses: List[str]
+        questions: list[str], contexts: list[list[str]], responses: list[str]
     ) -> Iterable[LLMTestCase]:
         InputConverters._validate_input_elements(questions=questions, contexts=contexts, responses=responses)
-        for q, c, r in zip(questions, contexts, responses):  # type: ignore
+        for q, c, r in zip(questions, contexts, responses, strict=True):  # type: ignore
             test_case = LLMTestCase(input=q, actual_output=r, retrieval_context=c)
             yield test_case
 
     @staticmethod
     def question_context_response_ground_truth(
-        questions: List[str], contexts: List[List[str]], responses: List[str], ground_truths: List[str]
+        questions: list[str], contexts: list[list[str]], responses: list[str], ground_truths: list[str]
     ) -> Iterable[LLMTestCase]:
         InputConverters._validate_input_elements(questions=questions, contexts=contexts, responses=responses)
-        for q, c, r, gt in zip(questions, contexts, responses, ground_truths):  # type: ignore
+        for q, c, r, gt in zip(questions, contexts, responses, ground_truths, strict=True):  # type: ignore
             test_case = LLMTestCase(input=q, actual_output=r, retrieval_context=c, expected_output=gt)
             yield test_case
 
@@ -210,8 +211,8 @@ class OutputConverters:
 
     @staticmethod
     def default(
         metric: DeepEvalMetric,
-    ) -> Callable[[TestResult], List[MetricResult]]:
-        def inner(output: TestResult, metric: DeepEvalMetric) -> List[MetricResult]:
+    ) -> Callable[[TestResult], list[MetricResult]]:
+        def inner(output: TestResult, metric: DeepEvalMetric) -> list[MetricResult]:
             metric_name = str(metric)
             assert output.metrics_data
             assert len(output.metrics_data) == 1
@@ -227,30 +228,30 @@ def inner(output: TestResult, metric: DeepEvalMetric) -> List[MetricResult]:
         DeepEvalMetric.ANSWER_RELEVANCY,
         AnswerRelevancyMetric,
         InputConverters.question_context_response,  # type: ignore
-        init_parameters={"model": Optional[str]},  # type: ignore
+        init_parameters={"model": str | None},  # type: ignore
     ),
     DeepEvalMetric.FAITHFULNESS: MetricDescriptor.new(
         DeepEvalMetric.FAITHFULNESS,
         FaithfulnessMetric,
         InputConverters.question_context_response,  # type: ignore
-        init_parameters={"model": Optional[str]},  # type: ignore
+        init_parameters={"model": str | None},  # type: ignore
     ),
     DeepEvalMetric.CONTEXTUAL_PRECISION: MetricDescriptor.new(
         DeepEvalMetric.CONTEXTUAL_PRECISION,
         ContextualPrecisionMetric,
         InputConverters.question_context_response_ground_truth,  # type: ignore
-        init_parameters={"model": Optional[str]},  # type: ignore
+        init_parameters={"model": str | None},  # type: ignore
     ),
     DeepEvalMetric.CONTEXTUAL_RECALL: MetricDescriptor.new(
         DeepEvalMetric.CONTEXTUAL_RECALL,
         ContextualRecallMetric,
         InputConverters.question_context_response_ground_truth,  # type: ignore
-        init_parameters={"model": Optional[str]},  # type: ignore
+        init_parameters={"model": str | None},  # type: ignore
     ),
     DeepEvalMetric.CONTEXTUAL_RELEVANCE: MetricDescriptor.new(
         DeepEvalMetric.CONTEXTUAL_RELEVANCE,
         ContextualRelevancyMetric,
         InputConverters.question_context_response,  # type: ignore
-        init_parameters={"model": Optional[str]},  # type: ignore
+        init_parameters={"model": str | None},  # type: ignore
     ),
 }
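Besides the annotation changes, the input converters now pass strict=True to zip (available since Python 3.10), so mismatched input lengths raise immediately instead of being silently truncated to the shortest list. A small illustration of the difference, using made-up data rather than anything from the repository:

    questions = ["q1", "q2", "q3"]
    responses = ["r1", "r2"]  # one response missing

    # Plain zip stops quietly at the shorter iterable:
    print(list(zip(questions, responses)))  # [('q1', 'r1'), ('q2', 'r2')]

    # zip(..., strict=True) raises instead of dropping the unmatched item:
    try:
        list(zip(questions, responses, strict=True))
    except ValueError as err:
        print(err)  # zip() argument 2 is shorter than argument 1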

integrations/deepeval/tests/test_evaluator.py

Lines changed: 3 additions & 4 deletions
@@ -1,7 +1,6 @@
 import copy
 import os
 from dataclasses import dataclass
-from typing import Dict, Optional
 
 import pytest
 from deepeval.evaluate.types import EvaluationResult, TestResult
@@ -46,8 +45,8 @@ class Unserializable:
 @dataclass(frozen=True)
 class MockResult:
     score: float
-    reason: Optional[str] = None
-    score_breakdown: Optional[Dict[str, float]] = None
+    reason: str | None = None
+    score_breakdown: dict[str, float] | None = None
 
 
 # Only returns results for the passed metrics.
@@ -273,7 +272,7 @@ def test_evaluator_outputs(metric, inputs, expected_outputs, metric_params, monk
     assert isinstance(results, type(expected_outputs))
     assert len(results) == len(expected_outputs)
 
-    for r, o in zip(results, expected_outputs):
+    for r, o in zip(results, expected_outputs, strict=True):
         assert len(r) == len(o)
 
         expected = {(name if name is not None else str(metric), score, exp) for name, score, exp in o}
