From e1f0ca8760d2c187cd321b231043cfe785d4554d Mon Sep 17 00:00:00 2001
From: "Yuri F. Albuquerque"
Date: Sun, 20 Oct 2024 18:38:42 -0300
Subject: [PATCH 1/7] This commit implements the F-beta score metric for the
 AnswerCorrectness class.

---
 src/ragas/metrics/_answer_correctness.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py
index edbd138ec..96862ca35 100644
--- a/src/ragas/metrics/_answer_correctness.py
+++ b/src/ragas/metrics/_answer_correctness.py
@@ -167,6 +167,7 @@ class AnswerCorrectness(MetricWithLLM, MetricWithEmbeddings, SingleTurnMetric):
         default_factory=LongFormAnswerPrompt
     )
     weights: list[float] = field(default_factory=lambda: [0.75, 0.25])
+    beta: float = field(default_factory= lambda: 1.0)
     answer_similarity: t.Optional[AnswerSimilarity] = None
     sentence_segmenter: t.Optional[HasSegmentMethod] = None
     max_retries: int = 1
@@ -185,6 +186,9 @@ def __post_init__(self: t.Self):
             language = self.long_form_answer_prompt.language
             self.sentence_segmenter = get_segmenter(language=language, clean=False)
 
+        if type(self.beta) is not float:
+            raise ValueError("Beta must be a float. A beta > 1 gives more weight to recall, while beta < 1 favors precision.")
+
     def init(self, run_config: RunConfig):
         super().init(run_config)
         if self.answer_similarity is None and self.weights[1] != 0:
@@ -198,7 +202,8 @@ def _compute_statement_presence(
         tp = len(prediction.TP)
         fp = len(prediction.FP)
         fn = len(prediction.FN)
-        score = tp / (tp + 0.5 * (fp + fn)) if tp > 0 else 0
+        beta = self.beta
+        score = ((1 + beta * beta) * tp) / ((1 + beta * beta) * tp + fp + (beta * beta) * fn) if tp > 0 else 0
         return score
 
     async def _create_simplified_statements(

From 1c339e983e43ad9c03e511400e9752de4b70b7bf Mon Sep 17 00:00:00 2001
From: "Yuri F. Albuquerque"
Date: Mon, 21 Oct 2024 14:21:41 -0300
Subject: [PATCH 2/7] also implements the F-beta score for
 _factual_correctness, which is a weighted harmonic mean of precision and
 recall, where the recall is weighted by a factor of beta.

The F-beta score is defined as:

    F-beta = (1 + beta^2) * (precision * recall) / (beta^2 * precision + recall)

The F-beta score is a generalization of the F1 score, where beta = 1.0. The F1
score is the harmonic mean of precision and recall, and is defined as:

    F1 = 2 * (precision * recall) / (precision + recall)

---
 src/ragas/metrics/_factual_correctness.py | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/src/ragas/metrics/_factual_correctness.py b/src/ragas/metrics/_factual_correctness.py
index 5147cb112..2fc890018 100644
--- a/src/ragas/metrics/_factual_correctness.py
+++ b/src/ragas/metrics/_factual_correctness.py
@@ -186,6 +186,7 @@ class FactualCorrectness(MetricWithLLM, SingleTurnMetric):
         default_factory=lambda: {MetricType.SINGLE_TURN: {"response", "reference"}}
     )
     mode: t.Literal["precision", "recall", "f1"] = "f1"
+    beta: float = Field(default_factory=lambda: 1.0)
    atomicity: t.Literal["low", "high"] = "low"
     coverage: t.Literal["low", "high"] = "low"
     claim_decomposition_prompt: PydanticPrompt = ClaimDecompositionPrompt()
@@ -204,6 +205,9 @@ def __post_init__(self):
         )
         self.segmenter = get_segmenter(language="english")
 
+        if type(self.beta) is not float:
+            raise ValueError("Beta must be a float. A beta > 1 gives more weight to recall, while beta < 1 favors precision.")
+
     async def decompose_claims(
         self, response: str, callbacks: Callbacks
     ) -> t.List[str]:
@@ -248,18 +252,21 @@ async def _single_turn_ascore(
             premise=response, hypothesis_list=reference_claims, callbacks=callbacks
         )
 
-        true_positives = sum(reference_response)
-        false_positives = sum(~reference_response)
-        false_negatives = sum(~response_reference)
+        # Calculate the true positives, false positives, and false negatives
+        tp = sum(reference_response)
+        fp = sum(~reference_response)
+        fn = sum(~response_reference)
+
+        beta = self.beta
 
-        if self.mode == "precision":
-            score = true_positives / (true_positives + false_positives + 1e-8)
+        if self.mode == "precision" or beta == 0:
+            beta = 1e-8  # to avoid any division by zero
         elif self.mode == "recall":
-            score = true_positives / (true_positives + false_negatives + 1e-8)
+            beta = 1e8
         else:
-            precision = true_positives / (true_positives + false_positives + 1e-8)
-            recall = true_positives / (true_positives + false_negatives + 1e-8)
-            score = 2 * (precision * recall) / (precision + recall + 1e-8)
+            self.mode == "f1"
+
+        score = ((1 + beta * beta) * tp) / ((1 + beta * beta) * tp + fp + (beta * beta) * fn) if tp > 0 else 0
 
         return np.round(score, 2)

From ed3b3f3854dc78f026ea83b8049dc3c94c8904b5 Mon Sep 17 00:00:00 2001
From: "Yuri F. Albuquerque"
Date: Tue, 22 Oct 2024 08:41:19 -0300
Subject: [PATCH 3/7] returning to the original recall and precision
 calculation in factual correctness and keeping the f1 score as f1-beta score
 as requested.

---
 src/ragas/metrics/_factual_correctness.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/ragas/metrics/_factual_correctness.py b/src/ragas/metrics/_factual_correctness.py
index 2fc890018..015aa5623 100644
--- a/src/ragas/metrics/_factual_correctness.py
+++ b/src/ragas/metrics/_factual_correctness.py
@@ -252,21 +252,19 @@ async def _single_turn_ascore(
             premise=response, hypothesis_list=reference_claims, callbacks=callbacks
         )
 
-        # Calculate the true positives, false positives, and false negatives
+        # Calculate the true positives (tp), false positives (fp), and false negatives (fn)
         tp = sum(reference_response)
         fp = sum(~reference_response)
         fn = sum(~response_reference)
 
         beta = self.beta
 
-        if self.mode == "precision" or beta == 0:
-            beta = 1e-8  # to avoid any division by zero
+        if self.mode == "precision":
+            score = tp / (tp + fp + 1e-8)
         elif self.mode == "recall":
-            beta = 1e8
+            score = tp / (tp + fp + 1e-8)
         else:
-            self.mode == "f1"
-
-        score = ((1 + beta * beta) * tp) / ((1 + beta * beta) * tp + fp + (beta * beta) * fn) if tp > 0 else 0
+            score = ((1 + beta * beta) * tp) / ((1 + beta * beta) * tp + fp + (beta * beta) * fn) if tp > 0 else 0
 
         return np.round(score, 2)

From 92166c0c6633433998acd444a5fa52e747bbc013 Mon Sep 17 00:00:00 2001
From: Shahules786
Date: Fri, 25 Oct 2024 23:30:10 +0530
Subject: [PATCH 4/7] remove ALL_METRICS

---
 src/ragas/metrics/__init__.py |  7 -------
 src/ragas/metrics/utils.py    | 37 +++++++++++++++++++------------------
 tests/unit/test_metric.py     | 14 --------------
 3 files changed, 19 insertions(+), 39 deletions(-)

diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py
index f29d7cb25..9e9eb5225 100644
--- a/src/ragas/metrics/__init__.py
+++ b/src/ragas/metrics/__init__.py
@@ -106,10 +106,3 @@
     "TopicAdherenceScore",
     "LLMSQLEquivalence",
 ]
-
-current_module = sys.modules[__name__]
-ALL_METRICS = [
-    obj
-    for name, obj in inspect.getmembers(current_module)
-    if name in __all__ and not inspect.isclass(obj) and not inspect.isbuiltin(obj)
-]
diff --git a/src/ragas/metrics/utils.py b/src/ragas/metrics/utils.py
index 1b0b6ef2f..0f612c3fd 100644
--- a/src/ragas/metrics/utils.py
+++ b/src/ragas/metrics/utils.py
@@ -1,21 +1,22 @@
-from ragas.dataset_schema import EvaluationDataset
-from ragas.metrics import ALL_METRICS
-from ragas.metrics.base import Metric
-from ragas.validation import validate_required_columns
+def fbeta_score(tp, fp, fn, beta=1.0):
+    if tp + fp == 0:
+        precision = 0
+    else:
+        precision = tp / (tp + fp)
 
+    if tp + fn == 0:
+        recall = 0
+    else:
+        recall = tp / (tp + fn)
 
-def get_available_metrics(ds: EvaluationDataset) -> list[Metric]:
-    """
-    Get the available metrics for the given dataset.
-    E.g. if the dataset contains ("question", "answer", "contexts") columns,
-    the available metrics are those that can be evaluated in [qa, qac, qc] mode.
-    """
-    available_metrics = []
-    for metric in ALL_METRICS:
-        try:
-            validate_required_columns(ds, [metric])
-            available_metrics.append(metric)
-        except ValueError:
-            pass
+    if precision == 0 and recall == 0:
+        return 0.0
 
-    return available_metrics
+    beta_squared = beta**2
+    fbeta = (
+        (1 + beta_squared)
+        * (precision * recall)
+        / ((beta_squared * precision) + recall)
+    )
+
+    return fbeta
diff --git a/tests/unit/test_metric.py b/tests/unit/test_metric.py
index 4f589318d..c40c87166 100644
--- a/tests/unit/test_metric.py
+++ b/tests/unit/test_metric.py
@@ -3,20 +3,6 @@
 
 from ragas.dataset_schema import EvaluationDataset, SingleTurnSample
 from ragas.metrics.base import MetricType
-from ragas.metrics.utils import get_available_metrics
-
-
-def test_get_available_metrics():
-    sample1 = SingleTurnSample(user_input="What is X", response="Y")
-    sample2 = SingleTurnSample(user_input="What is Z", response="W")
-    ds = EvaluationDataset(samples=[sample1, sample2])
-
-    assert all(
-        [
-            m.required_columns["SINGLE_TURN"] == {"response", "user_input"}
-            for m in get_available_metrics(ds)
-        ]
-    ), "All metrics should have required columns ('user_input', 'response')"
 
 
 def test_single_turn_metric():

From 69cfa99003ae3a61125f95819cb6bf40b14f1495 Mon Sep 17 00:00:00 2001
From: Shahules786
Date: Fri, 25 Oct 2024 23:30:21 +0530
Subject: [PATCH 5/7] add fbeta score

---
 src/ragas/metrics/_factual_correctness.py | 31 ++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/src/ragas/metrics/_factual_correctness.py b/src/ragas/metrics/_factual_correctness.py
index 015aa5623..b1854ce93 100644
--- a/src/ragas/metrics/_factual_correctness.py
+++ b/src/ragas/metrics/_factual_correctness.py
@@ -16,6 +16,7 @@
     SingleTurnMetric,
     get_segmenter,
 )
+from ragas.metrics.utils import fbeta_score
 from ragas.prompt import PydanticPrompt
 
 if t.TYPE_CHECKING:
@@ -181,12 +182,32 @@ class ClaimDecompositionPrompt(
 
 @dataclass
 class FactualCorrectness(MetricWithLLM, SingleTurnMetric):
+    """
+    FactualCorrectness is a metric class that evaluates the factual correctness of responses
+    generated by a language model. It uses claim decomposition and natural language inference (NLI)
+    to verify the claims made in the responses against reference texts.
+
+    Attributes:
+        name (str): The name of the metric, default is "factual_correctness".
+        _required_columns (Dict[MetricType, Set[str]]): A dictionary specifying the required columns
+            for each metric type. Default is {"SINGLE_TURN": {"response", "reference"}}.
+        mode (Literal["precision", "recall", "f1"]): The mode of evaluation, can be "precision",
+            "recall", or "f1". Default is "f1".
+        beta (float): The beta value used for the F1 score calculation. A beta > 1 gives more weight
+            to recall, while beta < 1 favors precision. Default is 1.0.
+        atomicity (Literal["low", "high"]): The level of atomicity for claim decomposition. Default is "low".
+        coverage (Literal["low", "high"]): The level of coverage for claim decomposition. Default is "low".
+        claim_decomposition_prompt (PydanticPrompt): The prompt used for claim decomposition.
+        nli_prompt (PydanticPrompt): The prompt used for natural language inference (NLI).
+
+    """
+
     name: str = "factual_correctness"  # type: ignore
     _required_columns: t.Dict[MetricType, t.Set[str]] = field(
         default_factory=lambda: {MetricType.SINGLE_TURN: {"response", "reference"}}
     )
     mode: t.Literal["precision", "recall", "f1"] = "f1"
-    beta: float = Field(default_factory=lambda: 1.0)
+    beta: float = 1.0
     atomicity: t.Literal["low", "high"] = "low"
     coverage: t.Literal["low", "high"] = "low"
     claim_decomposition_prompt: PydanticPrompt = ClaimDecompositionPrompt()
@@ -206,7 +227,9 @@ def __post_init__(self):
         self.segmenter = get_segmenter(language="english")
 
         if type(self.beta) is not float:
-            raise ValueError("Beta must be a float. A beta > 1 gives more weight to recall, while beta < 1 favors precision.")
+            raise ValueError(
+                "Beta must be a float. A beta > 1 gives more weight to recall, while beta < 1 favors precision."
+            )
 
     async def decompose_claims(
         self, response: str, callbacks: Callbacks
@@ -257,14 +280,12 @@ async def _single_turn_ascore(
         fp = sum(~reference_response)
         fn = sum(~response_reference)
 
-        beta = self.beta
-
         if self.mode == "precision":
             score = tp / (tp + fp + 1e-8)
         elif self.mode == "recall":
             score = tp / (tp + fp + 1e-8)
         else:
-            score = ((1 + beta * beta) * tp) / ((1 + beta * beta) * tp + fp + (beta * beta) * fn) if tp > 0 else 0
+            score = fbeta_score(tp, fp, fn, self.beta)
 
         return np.round(score, 2)

From a0f6917a99d3e0645cb6547bfc5c06155be1c014 Mon Sep 17 00:00:00 2001
From: Shahules786
Date: Fri, 25 Oct 2024 23:34:16 +0530
Subject: [PATCH 6/7] replace by fbeta score

---
 src/ragas/metrics/_answer_correctness.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py
index 96862ca35..aa61dd3dc 100644
--- a/src/ragas/metrics/_answer_correctness.py
+++ b/src/ragas/metrics/_answer_correctness.py
@@ -21,6 +21,7 @@
     SingleTurnMetric,
     get_segmenter,
 )
+from ragas.metrics.utils import fbeta_score
 from ragas.prompt import PydanticPrompt
 from ragas.run_config import RunConfig
 
@@ -167,7 +168,7 @@ class AnswerCorrectness(MetricWithLLM, MetricWithEmbeddings, SingleTurnMetric):
         default_factory=LongFormAnswerPrompt
     )
     weights: list[float] = field(default_factory=lambda: [0.75, 0.25])
-    beta: float = field(default_factory= lambda: 1.0)
+    beta: float = 1.0
     answer_similarity: t.Optional[AnswerSimilarity] = None
     sentence_segmenter: t.Optional[HasSegmentMethod] = None
     max_retries: int = 1
@@ -187,7 +188,9 @@ def __post_init__(self: t.Self):
             self.sentence_segmenter = get_segmenter(language=language, clean=False)
 
         if type(self.beta) is not float:
-            raise ValueError("Beta must be a float. A beta > 1 gives more weight to recall, while beta < 1 favors precision.")
+            raise ValueError(
+                "Beta must be a float. A beta > 1 gives more weight to recall, while beta < 1 favors precision."
+            )
 
     def init(self, run_config: RunConfig):
         super().init(run_config)
@@ -202,8 +205,7 @@ def _compute_statement_presence(
         tp = len(prediction.TP)
         fp = len(prediction.FP)
         fn = len(prediction.FN)
-        beta = self.beta
-        score = ((1 + beta * beta) * tp) / ((1 + beta * beta) * tp + fp + (beta * beta) * fn) if tp > 0 else 0
+        score = fbeta_score(tp, fp, fn, self.beta)
         return score
 
     async def _create_simplified_statements(

From c432b1969bd41894028bda1935fc5f842cfc6d02 Mon Sep 17 00:00:00 2001
From: Shahules786
Date: Fri, 25 Oct 2024 23:34:46 +0530
Subject: [PATCH 7/7] removed unused imports

---
 src/ragas/metrics/__init__.py | 3 ---
 tests/unit/test_metric.py     | 2 +-
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py
index 9e9eb5225..65fc64927 100644
--- a/src/ragas/metrics/__init__.py
+++ b/src/ragas/metrics/__init__.py
@@ -1,6 +1,3 @@
-import inspect
-import sys
-
 from ragas.metrics._answer_correctness import AnswerCorrectness, answer_correctness
 from ragas.metrics._answer_relevance import (
     AnswerRelevancy,
diff --git a/tests/unit/test_metric.py b/tests/unit/test_metric.py
index c40c87166..7c8026cba 100644
--- a/tests/unit/test_metric.py
+++ b/tests/unit/test_metric.py
@@ -1,7 +1,7 @@
 import typing as t
 from dataclasses import dataclass, field
 
-from ragas.dataset_schema import EvaluationDataset, SingleTurnSample
+from ragas.dataset_schema import SingleTurnSample
 from ragas.metrics.base import MetricType
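
As a quick sanity check on the formula quoted in the PATCH 2/7 message, the following standalone snippet (not part of the patches; the claim counts tp=6, fp=4, fn=1 are made up for illustration) shows that F-beta reduces to the classic F1 when beta = 1.0 and shifts toward recall or precision as beta moves above or below 1:

    # Standalone illustration of the F-beta formula from the commit message above.
    # The counts are hypothetical: 6 true positives, 4 false positives, 1 false negative.
    tp, fp, fn = 6, 4, 1
    precision = tp / (tp + fp)   # 0.6
    recall = tp / (tp + fn)      # ~0.857

    def f_beta(p, r, beta):
        return (1 + beta**2) * (p * r) / (beta**2 * p + r)

    print(f_beta(precision, recall, 1.0))                  # ~0.706
    print(2 * precision * recall / (precision + recall))   # ~0.706, the classic F1
    print(f_beta(precision, recall, 2.0))                  # ~0.789, pulled toward recall
    print(f_beta(precision, recall, 0.5))                  # ~0.638, pulled toward precision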
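Once the series is applied, the new behaviour can be exercised end to end. Below is a minimal usage sketch, assuming ragas is installed at a revision containing these patches and that AnswerCorrectness and FactualCorrectness are exported from ragas.metrics as usual; the counts passed to fbeta_score are again made-up values:

    from ragas.metrics import AnswerCorrectness, FactualCorrectness
    from ragas.metrics.utils import fbeta_score

    # The shared helper added in PATCH 4/7: beta=1.0 is plain F1, beta>1 leans toward recall.
    print(fbeta_score(6, 4, 1, beta=1.0))   # ~0.71
    print(fbeta_score(6, 4, 1, beta=2.0))   # ~0.79

    # Both metrics now expose beta as a plain float field (default 1.0).
    factual = FactualCorrectness(mode="f1", beta=2.0)
    answer = AnswerCorrectness(beta=0.5)

    # A non-float beta trips the ValueError added in __post_init__.
    try:
        FactualCorrectness(beta=2)   # an int, not a float
    except ValueError as err:
        print(err)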