From e1f0ca8760d2c187cd321b231043cfe785d4554d Mon Sep 17 00:00:00 2001
From: "Yuri F. Albuquerque"
Date: Sun, 20 Oct 2024 18:38:42 -0300
Subject: [PATCH 1/7] This commit implements the F-beta score metric for the
 AnswerCorrectness class.

---
 src/ragas/metrics/_answer_correctness.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py
index edbd138ec..96862ca35 100644
--- a/src/ragas/metrics/_answer_correctness.py
+++ b/src/ragas/metrics/_answer_correctness.py
@@ -167,6 +167,7 @@ class AnswerCorrectness(MetricWithLLM, MetricWithEmbeddings, SingleTurnMetric):
         default_factory=LongFormAnswerPrompt
     )
     weights: list[float] = field(default_factory=lambda: [0.75, 0.25])
+    beta: float = field(default_factory= lambda: 1.0)
     answer_similarity: t.Optional[AnswerSimilarity] = None
     sentence_segmenter: t.Optional[HasSegmentMethod] = None
     max_retries: int = 1
@@ -185,6 +186,9 @@ def __post_init__(self: t.Self):
             language = self.long_form_answer_prompt.language
             self.sentence_segmenter = get_segmenter(language=language, clean=False)
 
+        if type(self.beta) is not float:
+            raise ValueError("Beta must be a float. A beta > 1 gives more weight to recall, while beta < 1 favors precision.")
+
     def init(self, run_config: RunConfig):
         super().init(run_config)
         if self.answer_similarity is None and self.weights[1] != 0:
@@ -198,7 +202,8 @@ def _compute_statement_presence(
         tp = len(prediction.TP)
         fp = len(prediction.FP)
         fn = len(prediction.FN)
-        score = tp / (tp + 0.5 * (fp + fn)) if tp > 0 else 0
+        beta = self.beta
+        score = ((1 + beta * beta) * tp) / ((1 + beta * beta) * tp + fp + (beta * beta) * fn) if tp > 0 else 0
         return score
 
     async def _create_simplified_statements(

From 1c339e983e43ad9c03e511400e9752de4b70b7bf Mon Sep 17 00:00:00 2001
From: "Yuri F. Albuquerque"
Date: Mon, 21 Oct 2024 14:21:41 -0300
Subject: [PATCH 2/7] also implements the F-beta score for
 _factual_correctness, which is a weighted harmonic mean of precision and
 recall, where the recall is weighted by a factor of beta.

The F-beta score is defined as:

    F-beta = (1 + beta^2) * (precision * recall) / (beta^2 * precision + recall)

The F-beta score is a generalization of the F1 score, where beta = 1.0. The F1
score is the harmonic mean of precision and recall, and is defined as:

    F1 = 2 * (precision * recall) / (precision + recall)

---
 src/ragas/metrics/_factual_correctness.py | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/src/ragas/metrics/_factual_correctness.py b/src/ragas/metrics/_factual_correctness.py
index 5147cb112..2fc890018 100644
--- a/src/ragas/metrics/_factual_correctness.py
+++ b/src/ragas/metrics/_factual_correctness.py
@@ -186,6 +186,7 @@ class FactualCorrectness(MetricWithLLM, SingleTurnMetric):
         default_factory=lambda: {MetricType.SINGLE_TURN: {"response", "reference"}}
     )
     mode: t.Literal["precision", "recall", "f1"] = "f1"
+    beta: float = Field(default_factory=lambda: 1.0)
    atomicity: t.Literal["low", "high"] = "low"
     coverage: t.Literal["low", "high"] = "low"
     claim_decomposition_prompt: PydanticPrompt = ClaimDecompositionPrompt()
@@ -204,6 +205,9 @@ def __post_init__(self):
         )
         self.segmenter = get_segmenter(language="english")
 
+        if type(self.beta) is not float:
+            raise ValueError("Beta must be a float. A beta > 1 gives more weight to recall, while beta < 1 favors precision.")
+
     async def decompose_claims(
         self, response: str, callbacks: Callbacks
     ) -> t.List[str]:
@@ -248,18 +252,21 @@ async def _single_turn_ascore(
             premise=response, hypothesis_list=reference_claims, callbacks=callbacks
         )
 
-        true_positives = sum(reference_response)
-        false_positives = sum(~reference_response)
-        false_negatives = sum(~response_reference)
+        # Calculate the true positives, false positives, and false negatives
+        tp = sum(reference_response)
+        fp = sum(~reference_response)
+        fn = sum(~response_reference)
+
+        beta = self.beta
 
-        if self.mode == "precision":
-            score = true_positives / (true_positives + false_positives + 1e-8)
+        if self.mode == "precision" or beta == 0:
+            beta = 1e-8  # to avoid any division by zero
         elif self.mode == "recall":
-            score = true_positives / (true_positives + false_negatives + 1e-8)
+            beta = 1e8
         else:
-            precision = true_positives / (true_positives + false_positives + 1e-8)
-            recall = true_positives / (true_positives + false_negatives + 1e-8)
-            score = 2 * (precision * recall) / (precision + recall + 1e-8)
+            self.mode == "f1"
+
+        score = ((1 + beta * beta) * tp) / ((1 + beta * beta) * tp + fp + (beta * beta) * fn) if tp > 0 else 0
 
         return np.round(score, 2)

From ed3b3f3854dc78f026ea83b8049dc3c94c8904b5 Mon Sep 17 00:00:00 2001
From: "Yuri F. Albuquerque"
Date: Tue, 22 Oct 2024 08:41:19 -0300
Subject: [PATCH 3/7] returning to the original recall and precision
 calculation in factual correctness and keeping the f1 score as f1-beta score
 as requested.

---
 src/ragas/metrics/_factual_correctness.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/ragas/metrics/_factual_correctness.py b/src/ragas/metrics/_factual_correctness.py
index 2fc890018..015aa5623 100644
--- a/src/ragas/metrics/_factual_correctness.py
+++ b/src/ragas/metrics/_factual_correctness.py
@@ -252,21 +252,19 @@ async def _single_turn_ascore(
             premise=response, hypothesis_list=reference_claims, callbacks=callbacks
         )
 
-        # Calculate the true positives, false positives, and false negatives
+        # Calculate the true positives (tp), false positives (fp), and false negatives (fn)
         tp = sum(reference_response)
         fp = sum(~reference_response)
         fn = sum(~response_reference)
 
         beta = self.beta
 
-        if self.mode == "precision" or beta == 0:
-            beta = 1e-8  # to avoid any division by zero
+        if self.mode == "precision":
+            score = tp / (tp + fp + 1e-8)
         elif self.mode == "recall":
-            beta = 1e8
+            score = tp / (tp + fp + 1e-8)
         else:
-            self.mode == "f1"
-
-        score = ((1 + beta * beta) * tp) / ((1 + beta * beta) * tp + fp + (beta * beta) * fn) if tp > 0 else 0
+            score = ((1 + beta * beta) * tp) / ((1 + beta * beta) * tp + fp + (beta * beta) * fn) if tp > 0 else 0
 
         return np.round(score, 2)

From 92166c0c6633433998acd444a5fa52e747bbc013 Mon Sep 17 00:00:00 2001
From: Shahules786
Date: Fri, 25 Oct 2024 23:30:10 +0530
Subject: [PATCH 4/7] remove ALL_METRICS

---
 src/ragas/metrics/__init__.py |  7 -------
 src/ragas/metrics/utils.py    | 37 +++++++++++++++++++------------------
 tests/unit/test_metric.py     | 14 --------------
 3 files changed, 19 insertions(+), 39 deletions(-)

diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py
index f29d7cb25..9e9eb5225 100644
--- a/src/ragas/metrics/__init__.py
+++ b/src/ragas/metrics/__init__.py
@@ -106,10 +106,3 @@
     "TopicAdherenceScore",
     "LLMSQLEquivalence",
 ]
-
-current_module = sys.modules[__name__]
-ALL_METRICS = [
-    obj
-    for name, obj in inspect.getmembers(current_module)
-    if name in __all__ and not inspect.isclass(obj) and not inspect.isbuiltin(obj)
-]
diff --git a/src/ragas/metrics/utils.py b/src/ragas/metrics/utils.py
index 1b0b6ef2f..0f612c3fd 100644
--- a/src/ragas/metrics/utils.py
+++ b/src/ragas/metrics/utils.py
@@ -1,21 +1,22 @@
-from ragas.dataset_schema import EvaluationDataset
-from ragas.metrics import ALL_METRICS
-from ragas.metrics.base import Metric
-from ragas.validation import validate_required_columns
+def fbeta_score(tp, fp, fn, beta=1.0):
+    if tp + fp == 0:
+        precision = 0
+    else:
+        precision = tp / (tp + fp)
 
+    if tp + fn == 0:
+        recall = 0
+    else:
+        recall = tp / (tp + fn)
 
-def get_available_metrics(ds: EvaluationDataset) -> list[Metric]:
-    """
-    Get the available metrics for the given dataset.
-    E.g. if the dataset contains ("question", "answer", "contexts") columns,
-    the available metrics are those that can be evaluated in [qa, qac, qc] mode.
-    """
-    available_metrics = []
-    for metric in ALL_METRICS:
-        try:
-            validate_required_columns(ds, [metric])
-            available_metrics.append(metric)
-        except ValueError:
-            pass
+    if precision == 0 and recall == 0:
+        return 0.0
 
-    return available_metrics
+    beta_squared = beta**2
+    fbeta = (
+        (1 + beta_squared)
+        * (precision * recall)
+        / ((beta_squared * precision) + recall)
+    )
+
+    return fbeta
diff --git a/tests/unit/test_metric.py b/tests/unit/test_metric.py
index 4f589318d..c40c87166 100644
--- a/tests/unit/test_metric.py
+++ b/tests/unit/test_metric.py
@@ -3,20 +3,6 @@
 
 from ragas.dataset_schema import EvaluationDataset, SingleTurnSample
 from ragas.metrics.base import MetricType
-from ragas.metrics.utils import get_available_metrics
-
-
-def test_get_available_metrics():
-    sample1 = SingleTurnSample(user_input="What is X", response="Y")
-    sample2 = SingleTurnSample(user_input="What is Z", response="W")
-    ds = EvaluationDataset(samples=[sample1, sample2])
-
-    assert all(
-        [
-            m.required_columns["SINGLE_TURN"] == {"response", "user_input"}
-            for m in get_available_metrics(ds)
-        ]
-    ), "All metrics should have required columns ('user_input', 'response')"
 
 
 def test_single_turn_metric():

From 69cfa99003ae3a61125f95819cb6bf40b14f1495 Mon Sep 17 00:00:00 2001
From: Shahules786
Date: Fri, 25 Oct 2024 23:30:21 +0530
Subject: [PATCH 5/7] add fbeta score

---
 src/ragas/metrics/_factual_correctness.py | 31 ++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/src/ragas/metrics/_factual_correctness.py b/src/ragas/metrics/_factual_correctness.py
index 015aa5623..b1854ce93 100644
--- a/src/ragas/metrics/_factual_correctness.py
+++ b/src/ragas/metrics/_factual_correctness.py
@@ -16,6 +16,7 @@
     SingleTurnMetric,
     get_segmenter,
 )
+from ragas.metrics.utils import fbeta_score
 from ragas.prompt import PydanticPrompt
 
 if t.TYPE_CHECKING:
@@ -181,12 +182,32 @@ class ClaimDecompositionPrompt(
 
 @dataclass
 class FactualCorrectness(MetricWithLLM, SingleTurnMetric):
+    """
+    FactualCorrectness is a metric class that evaluates the factual correctness of responses
+    generated by a language model. It uses claim decomposition and natural language inference (NLI)
+    to verify the claims made in the responses against reference texts.
+
+    Attributes:
+        name (str): The name of the metric, default is "factual_correctness".
+        _required_columns (Dict[MetricType, Set[str]]): A dictionary specifying the required columns
+            for each metric type. Default is {"SINGLE_TURN": {"response", "reference"}}.
+        mode (Literal["precision", "recall", "f1"]): The mode of evaluation, can be "precision",
+            "recall", or "f1". Default is "f1".
+        beta (float): The beta value used for the F1 score calculation. A beta > 1 gives more weight
+            to recall, while beta < 1 favors precision. Default is 1.0.
+        atomicity (Literal["low", "high"]): The level of atomicity for claim decomposition. Default is "low".
+        coverage (Literal["low", "high"]): The level of coverage for claim decomposition. Default is "low".
+        claim_decomposition_prompt (PydanticPrompt): The prompt used for claim decomposition.
+        nli_prompt (PydanticPrompt): The prompt used for natural language inference (NLI).
+
+    """
+
     name: str = "factual_correctness"  # type: ignore
     _required_columns: t.Dict[MetricType, t.Set[str]] = field(
         default_factory=lambda: {MetricType.SINGLE_TURN: {"response", "reference"}}
     )
     mode: t.Literal["precision", "recall", "f1"] = "f1"
-    beta: float = Field(default_factory=lambda: 1.0)
+    beta: float = 1.0
     atomicity: t.Literal["low", "high"] = "low"
     coverage: t.Literal["low", "high"] = "low"
     claim_decomposition_prompt: PydanticPrompt = ClaimDecompositionPrompt()
@@ -206,7 +227,9 @@ def __post_init__(self):
         self.segmenter = get_segmenter(language="english")
 
         if type(self.beta) is not float:
-            raise ValueError("Beta must be a float. A beta > 1 gives more weight to recall, while beta < 1 favors precision.")
+            raise ValueError(
+                "Beta must be a float. A beta > 1 gives more weight to recall, while beta < 1 favors precision."
+            )
 
     async def decompose_claims(
         self, response: str, callbacks: Callbacks
@@ -257,14 +280,12 @@ async def _single_turn_ascore(
         fp = sum(~reference_response)
         fn = sum(~response_reference)
 
-        beta = self.beta
-
         if self.mode == "precision":
             score = tp / (tp + fp + 1e-8)
         elif self.mode == "recall":
             score = tp / (tp + fp + 1e-8)
         else:
-            score = ((1 + beta * beta) * tp) / ((1 + beta * beta) * tp + fp + (beta * beta) * fn) if tp > 0 else 0
+            score = fbeta_score(tp, fp, fn, self.beta)
 
         return np.round(score, 2)

From a0f6917a99d3e0645cb6547bfc5c06155be1c014 Mon Sep 17 00:00:00 2001
From: Shahules786
Date: Fri, 25 Oct 2024 23:34:16 +0530
Subject: [PATCH 6/7] replace by fbeta score

---
 src/ragas/metrics/_answer_correctness.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py
index 96862ca35..aa61dd3dc 100644
--- a/src/ragas/metrics/_answer_correctness.py
+++ b/src/ragas/metrics/_answer_correctness.py
@@ -21,6 +21,7 @@
     SingleTurnMetric,
     get_segmenter,
 )
+from ragas.metrics.utils import fbeta_score
 from ragas.prompt import PydanticPrompt
 from ragas.run_config import RunConfig
 
@@ -167,7 +168,7 @@ class AnswerCorrectness(MetricWithLLM, MetricWithEmbeddings, SingleTurnMetric):
         default_factory=LongFormAnswerPrompt
     )
     weights: list[float] = field(default_factory=lambda: [0.75, 0.25])
-    beta: float = field(default_factory= lambda: 1.0)
+    beta: float = 1.0
     answer_similarity: t.Optional[AnswerSimilarity] = None
     sentence_segmenter: t.Optional[HasSegmentMethod] = None
     max_retries: int = 1
@@ -187,7 +188,9 @@ def __post_init__(self: t.Self):
             self.sentence_segmenter = get_segmenter(language=language, clean=False)
 
         if type(self.beta) is not float:
-            raise ValueError("Beta must be a float. A beta > 1 gives more weight to recall, while beta < 1 favors precision.")
+            raise ValueError(
+                "Beta must be a float. A beta > 1 gives more weight to recall, while beta < 1 favors precision."
+            )
 
     def init(self, run_config: RunConfig):
         super().init(run_config)
@@ -202,8 +205,7 @@ def _compute_statement_presence(
         tp = len(prediction.TP)
         fp = len(prediction.FP)
         fn = len(prediction.FN)
-        beta = self.beta
-        score = ((1 + beta * beta) * tp) / ((1 + beta * beta) * tp + fp + (beta * beta) * fn) if tp > 0 else 0
+        score = fbeta_score(tp, fp, fn, self.beta)
         return score
 
     async def _create_simplified_statements(

From c432b1969bd41894028bda1935fc5f842cfc6d02 Mon Sep 17 00:00:00 2001
From: Shahules786
Date: Fri, 25 Oct 2024 23:34:46 +0530
Subject: [PATCH 7/7] removed unused imports

---
 src/ragas/metrics/__init__.py | 3 ---
 tests/unit/test_metric.py     | 2 +-
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py
index 9e9eb5225..65fc64927 100644
--- a/src/ragas/metrics/__init__.py
+++ b/src/ragas/metrics/__init__.py
@@ -1,6 +1,3 @@
-import inspect
-import sys
-
 from ragas.metrics._answer_correctness import AnswerCorrectness, answer_correctness
 from ragas.metrics._answer_relevance import (
     AnswerRelevancy,
diff --git a/tests/unit/test_metric.py b/tests/unit/test_metric.py
index c40c87166..7c8026cba 100644
--- a/tests/unit/test_metric.py
+++ b/tests/unit/test_metric.py
@@ -1,7 +1,7 @@
 import typing as t
 from dataclasses import dataclass, field
 
-from ragas.dataset_schema import EvaluationDataset, SingleTurnSample
+from ragas.dataset_schema import SingleTurnSample
 from ragas.metrics.base import MetricType
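
As a quick sanity check on the formula quoted in the PATCH 2/7 message, the following standalone snippet (not part of the patches; the claim counts tp=6, fp=4, fn=1 are made up for illustration) shows that F-beta reduces to the classic F1 when beta = 1.0 and shifts toward recall or precision as beta moves above or below 1:

    # Standalone illustration of the F-beta formula from the commit message above.
    # The counts are hypothetical: 6 true positives, 4 false positives, 1 false negative.
    tp, fp, fn = 6, 4, 1
    precision = tp / (tp + fp)   # 0.6
    recall = tp / (tp + fn)      # ~0.857

    def f_beta(p, r, beta):
        return (1 + beta**2) * (p * r) / (beta**2 * p + r)

    print(f_beta(precision, recall, 1.0))                  # ~0.706
    print(2 * precision * recall / (precision + recall))   # ~0.706, the classic F1
    print(f_beta(precision, recall, 2.0))                  # ~0.789, pulled toward recall
    print(f_beta(precision, recall, 0.5))                  # ~0.638, pulled toward precision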
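Once the series is applied, the new behaviour can be exercised end to end. Below is a minimal usage sketch, assuming ragas is installed at a revision containing these patches and that AnswerCorrectness and FactualCorrectness are exported from ragas.metrics as usual; the counts passed to fbeta_score are again made-up values:

    from ragas.metrics import AnswerCorrectness, FactualCorrectness
    from ragas.metrics.utils import fbeta_score

    # The shared helper added in PATCH 4/7: beta=1.0 is plain F1, beta>1 leans toward recall.
    print(fbeta_score(6, 4, 1, beta=1.0))   # ~0.71
    print(fbeta_score(6, 4, 1, beta=2.0))   # ~0.79

    # Both metrics now expose beta as a plain float field (default 1.0).
    factual = FactualCorrectness(mode="f1", beta=2.0)
    answer = AnswerCorrectness(beta=0.5)

    # A non-float beta trips the ValueError added in __post_init__.
    try:
        FactualCorrectness(beta=2)   # an int, not a float
    except ValueError as err:
        print(err)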