
Commit 7eeeec0

Merge branch 'main' into feat/fastembed-colbert-reranker
2 parents c36f86e + 5a67b27 commit 7eeeec0

File tree: 5 files changed (+43, -43 lines)

.github/workflows/deepeval.yml
integrations/deepeval/pyproject.toml
integrations/deepeval/src/haystack_integrations/components/evaluators/deepeval/evaluator.py
integrations/deepeval/src/haystack_integrations/components/evaluators/deepeval/metrics.py
integrations/deepeval/tests/test_evaluator.py


.github/workflows/deepeval.yml

Lines changed: 3 additions & 3 deletions
@@ -32,7 +32,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-latest, windows-latest, macos-latest]
-        python-version: ["3.9", "3.13"]
+        python-version: ["3.10", "3.13"]
 
     steps:
       - name: Support longpaths
@@ -50,11 +50,11 @@ jobs:
       - name: Install Hatch
         run: pip install --upgrade hatch
       - name: Lint
-        if: matrix.python-version == '3.9' && runner.os == 'Linux'
+        if: matrix.python-version == '3.10' && runner.os == 'Linux'
         run: hatch run fmt-check && hatch run test:types
 
       - name: Generate docs
-        if: matrix.python-version == '3.9' && runner.os == 'Linux'
+        if: matrix.python-version == '3.10' && runner.os == 'Linux'
         run: hatch run docs
 
       - name: Run tests

integrations/deepeval/pyproject.toml

Lines changed: 3 additions & 4 deletions
@@ -5,17 +5,16 @@ build-backend = "hatchling.build"
 [project]
 name = "deepeval-haystack"
 dynamic = ["version"]
-description = 'An integration of DeepEvla LLM evaluation framework with Haystack'
+description = 'An integration of DeepEval LLM evaluation framework with Haystack'
 readme = "README.md"
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 license = "Apache-2.0"
 keywords = []
 authors = [{ name = "deepset GmbH", email = "info@deepset.ai" }]
 classifiers = [
     "License :: OSI Approved :: Apache Software License",
     "Development Status :: 4 - Beta",
     "Programming Language :: Python",
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
@@ -77,7 +76,7 @@ module = ["deepeval.*"]
 ignore_missing_imports = true
 
 [tool.ruff]
-target-version = "py38"
+target-version = "py310"
 line-length = 120
 
 [tool.ruff.lint]
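The requires-python bump, the dropped 3.9 classifier, and the new ruff target-version all track the source changes in this commit: the code now uses PEP 604 unions (str | None) and built-in generics (dict[str, Any]), some of them in expressions evaluated at runtime, and those need Python 3.10 or newer. A minimal sketch of that constraint, purely illustrative and not part of the repository:

    import sys

    # PEP 604 unions used as runtime values (e.g. stored in a dict) need Python >= 3.10;
    # on 3.9 the `|` operator is not defined between a type and None.
    assert sys.version_info >= (3, 10), "str | None as a runtime expression needs 3.10+"

    init_parameters = {"model": str | None}  # creates a types.UnionType instance
    print(init_parameters["model"])          # prints: str | None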

integrations/deepeval/src/haystack_integrations/components/evaluators/deepeval/evaluator.py

Lines changed: 11 additions & 10 deletions
@@ -1,5 +1,6 @@
 import json
-from typing import Any, Callable, Dict, List, Optional, Union
+from collections.abc import Callable
+from typing import Any
 
 from haystack import DeserializationError, component, default_from_dict, default_to_dict
 
@@ -47,12 +48,12 @@ class DeepEvalEvaluator:
 
     _backend_metric: BaseMetric
     # Wrapped for easy mocking.
-    _backend_callable: Callable[[List[LLMTestCase], BaseMetric], EvaluationResult]
+    _backend_callable: Callable[[list[LLMTestCase], BaseMetric], EvaluationResult]
 
     def __init__(
         self,
-        metric: Union[str, DeepEvalMetric],
-        metric_params: Optional[Dict[str, Any]] = None,
+        metric: str | DeepEvalMetric,
+        metric_params: dict[str, Any] | None = None,
     ):
         """
         Construct a new DeepEval evaluator.
@@ -72,8 +73,8 @@ def __init__(
         expected_inputs = self.descriptor.input_parameters
         component.set_input_types(self, **expected_inputs)
 
-    @component.output_types(results=List[List[Dict[str, Any]]])
-    def run(self, **inputs: Any) -> Dict[str, Any]:
+    @component.output_types(results=list[list[dict[str, Any]]])
+    def run(self, **inputs: Any) -> dict[str, Any]:
         """
         Run the DeepEval evaluator on the provided inputs.
 
@@ -91,7 +92,7 @@ def run(self, **inputs: Any) -> Dict[str, Any]:
             - `explanation` - An optional explanation of the score.
         """
         InputConverters.validate_input_parameters(self.metric, self.descriptor.input_parameters, inputs)
-        converted_inputs: List[LLMTestCase] = list(self.descriptor.input_converter(**inputs))  # type: ignore
+        converted_inputs: list[LLMTestCase] = list(self.descriptor.input_converter(**inputs))  # type: ignore
 
         results = self._backend_callable(converted_inputs, self._backend_metric)
         converted_results = [
@@ -100,7 +101,7 @@ def run(self, **inputs: Any) -> Dict[str, Any]:
 
         return {"results": converted_results}
 
-    def to_dict(self) -> Dict[str, Any]:
+    def to_dict(self) -> dict[str, Any]:
         """
         Serializes the component to a dictionary.
 
@@ -128,7 +129,7 @@ def check_serializable(obj: Any) -> bool:
             )
 
     @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> "DeepEvalEvaluator":
+    def from_dict(cls, data: dict[str, Any]) -> "DeepEvalEvaluator":
         """
         Deserializes the component from a dictionary.
 
@@ -140,7 +141,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "DeepEvalEvaluator":
         return default_from_dict(cls, data)
 
     @staticmethod
-    def _invoke_deepeval(test_cases: List[LLMTestCase], metric: BaseMetric) -> EvaluationResult:
+    def _invoke_deepeval(test_cases: list[LLMTestCase], metric: BaseMetric) -> EvaluationResult:
         return evaluate(test_cases=test_cases, metrics=[metric])
 
     def _init_backend(self):
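The pattern in this file, repeated across the rest of the commit, is the move to Python 3.10 typing: Callable now comes from collections.abc, typing.List/Dict/Type become the built-in generics list/dict/type, and Optional[X]/Union[X, Y] become X | None / X | Y. A short self-contained sketch of the new style (the names below are hypothetical, not taken from the repository):

    from collections.abc import Callable
    from typing import Any

    # X | None and built-in generics replace Optional/Union and Dict/List;
    # Callable is imported from collections.abc instead of typing.
    def run(metric: str | int, params: dict[str, Any] | None = None) -> dict[str, Any]:
        return {"metric": metric, "params": params}

    handler: Callable[[list[str]], dict[str, Any]] = lambda items: {"count": len(items)}

    print(run("faithfulness"))           # {'metric': 'faithfulness', 'params': None}
    print(handler(["query", "answer"]))  # {'count': 2}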

integrations/deepeval/src/haystack_integrations/components/evaluators/deepeval/metrics.py

Lines changed: 23 additions & 22 deletions
@@ -1,9 +1,10 @@
 import dataclasses
 import inspect
+from collections.abc import Callable, Iterable, Mapping
 from dataclasses import dataclass
 from enum import Enum
 from functools import partial
-from typing import Any, Callable, Dict, Iterable, List, Mapping, Optional, Type
+from typing import Any
 
 from deepeval.evaluate.types import TestResult
 from deepeval.metrics import (
@@ -83,8 +84,8 @@ class MetricResult:
     """
 
     name: str
-    score: Optional[float] = None
-    explanation: Optional[str] = None
+    score: float | None = None
+    explanation: str | None = None
 
     def to_dict(self):
         return dataclasses.asdict(self)
@@ -112,21 +113,21 @@ class MetricDescriptor:
     """
 
     metric: DeepEvalMetric
-    backend: Type[BaseMetric]
-    input_parameters: Dict[str, Type]
+    backend: type[BaseMetric]
+    input_parameters: dict[str, type]
     input_converter: Callable[[Any], Iterable[LLMTestCase]]
-    output_converter: Callable[[TestResult], List[MetricResult]]
-    init_parameters: Optional[Mapping[str, Type]] = None
+    output_converter: Callable[[TestResult], list[MetricResult]]
+    init_parameters: Mapping[str, type] | None = None
 
     @classmethod
     def new(
         cls,
         metric: DeepEvalMetric,
-        backend: Type[BaseMetric],
+        backend: type[BaseMetric],
         input_converter: Callable[[Any], Iterable[LLMTestCase]],
-        output_converter: Optional[Callable[[TestResult], List[MetricResult]]] = None,
+        output_converter: Callable[[TestResult], list[MetricResult]] | None = None,
         *,
-        init_parameters: Optional[Mapping[str, Type]] = None,
+        init_parameters: Mapping[str, type] | None = None,
     ) -> "MetricDescriptor":
         input_converter_signature = inspect.signature(input_converter)
         input_parameters = {}
@@ -175,27 +176,27 @@ def _validate_input_elements(**kwargs):
             raise ValueError(msg)
 
     @staticmethod
-    def validate_input_parameters(metric: DeepEvalMetric, expected: Dict[str, Any], received: Dict[str, Any]) -> None:
+    def validate_input_parameters(metric: DeepEvalMetric, expected: dict[str, Any], received: dict[str, Any]) -> None:
         for param, _ in expected.items():
             if param not in received:
                 msg = f"DeepEval evaluator expected input parameter '{param}' for metric '{metric}'"
                 raise ValueError(msg)
 
     @staticmethod
     def question_context_response(
-        questions: List[str], contexts: List[List[str]], responses: List[str]
+        questions: list[str], contexts: list[list[str]], responses: list[str]
     ) -> Iterable[LLMTestCase]:
         InputConverters._validate_input_elements(questions=questions, contexts=contexts, responses=responses)
-        for q, c, r in zip(questions, contexts, responses):  # type: ignore
+        for q, c, r in zip(questions, contexts, responses, strict=True):  # type: ignore
             test_case = LLMTestCase(input=q, actual_output=r, retrieval_context=c)
             yield test_case
 
     @staticmethod
     def question_context_response_ground_truth(
-        questions: List[str], contexts: List[List[str]], responses: List[str], ground_truths: List[str]
+        questions: list[str], contexts: list[list[str]], responses: list[str], ground_truths: list[str]
     ) -> Iterable[LLMTestCase]:
         InputConverters._validate_input_elements(questions=questions, contexts=contexts, responses=responses)
-        for q, c, r, gt in zip(questions, contexts, responses, ground_truths):  # type: ignore
+        for q, c, r, gt in zip(questions, contexts, responses, ground_truths, strict=True):  # type: ignore
             test_case = LLMTestCase(input=q, actual_output=r, retrieval_context=c, expected_output=gt)
             yield test_case
 
@@ -210,8 +211,8 @@ class OutputConverters:
 
     @staticmethod
     def default(
         metric: DeepEvalMetric,
-    ) -> Callable[[TestResult], List[MetricResult]]:
-        def inner(output: TestResult, metric: DeepEvalMetric) -> List[MetricResult]:
+    ) -> Callable[[TestResult], list[MetricResult]]:
+        def inner(output: TestResult, metric: DeepEvalMetric) -> list[MetricResult]:
             metric_name = str(metric)
             assert output.metrics_data
             assert len(output.metrics_data) == 1
@@ -227,30 +228,30 @@ def inner(output: TestResult, metric: DeepEvalMetric) -> List[MetricResult]:
         DeepEvalMetric.ANSWER_RELEVANCY,
         AnswerRelevancyMetric,
         InputConverters.question_context_response,  # type: ignore
-        init_parameters={"model": Optional[str]},  # type: ignore
+        init_parameters={"model": str | None},  # type: ignore
     ),
     DeepEvalMetric.FAITHFULNESS: MetricDescriptor.new(
         DeepEvalMetric.FAITHFULNESS,
         FaithfulnessMetric,
         InputConverters.question_context_response,  # type: ignore
-        init_parameters={"model": Optional[str]},  # type: ignore
+        init_parameters={"model": str | None},  # type: ignore
     ),
     DeepEvalMetric.CONTEXTUAL_PRECISION: MetricDescriptor.new(
         DeepEvalMetric.CONTEXTUAL_PRECISION,
         ContextualPrecisionMetric,
         InputConverters.question_context_response_ground_truth,  # type: ignore
-        init_parameters={"model": Optional[str]},  # type: ignore
+        init_parameters={"model": str | None},  # type: ignore
     ),
     DeepEvalMetric.CONTEXTUAL_RECALL: MetricDescriptor.new(
         DeepEvalMetric.CONTEXTUAL_RECALL,
         ContextualRecallMetric,
         InputConverters.question_context_response_ground_truth,  # type: ignore
-        init_parameters={"model": Optional[str]},  # type: ignore
+        init_parameters={"model": str | None},  # type: ignore
     ),
     DeepEvalMetric.CONTEXTUAL_RELEVANCE: MetricDescriptor.new(
         DeepEvalMetric.CONTEXTUAL_RELEVANCE,
         ContextualRelevancyMetric,
         InputConverters.question_context_response,  # type: ignore
-        init_parameters={"model": Optional[str]},  # type: ignore
+        init_parameters={"model": str | None},  # type: ignore
     ),
 }
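Besides the annotation changes, the input converters now pass strict=True to zip (available since Python 3.10), so mismatched input lengths raise immediately instead of being silently truncated to the shortest list. A small illustration of the difference, using made-up data rather than anything from the repository:

    questions = ["q1", "q2", "q3"]
    responses = ["r1", "r2"]  # one response missing

    # Plain zip stops quietly at the shorter iterable:
    print(list(zip(questions, responses)))  # [('q1', 'r1'), ('q2', 'r2')]

    # zip(..., strict=True) raises instead of dropping the unmatched item:
    try:
        list(zip(questions, responses, strict=True))
    except ValueError as err:
        print(err)  # zip() argument 2 is shorter than argument 1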

integrations/deepeval/tests/test_evaluator.py

Lines changed: 3 additions & 4 deletions
@@ -1,7 +1,6 @@
 import copy
 import os
 from dataclasses import dataclass
-from typing import Dict, Optional
 
 import pytest
 from deepeval.evaluate.types import EvaluationResult, TestResult
@@ -46,8 +45,8 @@ class Unserializable:
 @dataclass(frozen=True)
 class MockResult:
     score: float
-    reason: Optional[str] = None
-    score_breakdown: Optional[Dict[str, float]] = None
+    reason: str | None = None
+    score_breakdown: dict[str, float] | None = None
 
 
 # Only returns results for the passed metrics.
@@ -273,7 +272,7 @@ def test_evaluator_outputs(metric, inputs, expected_outputs, metric_params, monk
     assert isinstance(results, type(expected_outputs))
     assert len(results) == len(expected_outputs)
 
-    for r, o in zip(results, expected_outputs):
+    for r, o in zip(results, expected_outputs, strict=True):
         assert len(r) == len(o)
 
         expected = {(name if name is not None else str(metric), score, exp) for name, score, exp in o}
