From b32d782ac37a0e211beaa3b3c00d39d24099b28a Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Mon, 8 Jan 2024 11:36:23 +0200 Subject: [PATCH 01/83] add tests for grouped instance metrics --- tests/test_metrics.py | 226 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 226 insertions(+) diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 22ff0404a8..eb1eaca1d0 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -9,6 +9,11 @@ F1Micro, F1MicroMultiLabel, F1Weighted, + MeanGroupedAccuracy, + MeanGroupedAccuracyPDR, + MeanGroupedStringContainment, + MeanGroupedStringContainmentPDR, + MeanGroupedTokenOverlap, Rouge, Squad, TokenOverlap, @@ -17,6 +22,91 @@ logger = get_logger() +# values of inputs that are common to grouped_mean type InstanceMetric +GROUPED_INSTANCE_PREDICTIONS = [ + "A B", + "BC D", + "C", + "123", + "BCD", + 10, + " BD", + "AB", + "I am a dog", + "AB C", + "AB 1", + "GMA", + 0.123, + "BD", + "abc", +] + +GROUPED_INSTANCE_REFERENCES = [ + ["B", "AB", "A"], + ["A", "BC D", "BC DF"], + ["c", " C"], + [13, 23, 234], + [" ", " BD", " BDA"], + [1, 10, 100], + ["A", "B", "BD"], + ["ABC", "ab", "BC"], + ["I am a person", "I AM A DOG", "ABC"], + ["AB CD", "AB", "ab"], + ["AB 1", "AB1"], + [" GMA 123", "GMA"], + ["123", 0.12], + ["BDE", "BCE", "bdefs"], + [" abcdefg", "AB", "abcd"], +] + +GROUPED_INSTANCE_PREDICTIONS_SHORT = [ + "A", + "B", + "B", + "A", + "B", + "B", + "A", + "A", + "B", + "B", + "A", + "B", + "A", + "A", + "B", +] + +GROUPED_INSTANCE_REFERENCES_SHORT = [ + ["A", "B"], + ["A", "C"], + ["B", "C", "A"], + ["A"], + ["B", "A"], + ["C", "B"], + ["A"], + ["B", "C"], + ["A", "B", "C"], + ["A", "B"], + ["B", "C"], + ["C"], + ["C", "B"], + ["B", "A"], + ["B"], +] + +# possibly multi-column group identifier +GROUPED_INSTANCE_ADDL_INPUTS = ( + [{"group": "grp1", "id": 0}] * 5 + + [{"group": "grp1", "id": 1}] * 5 + + [{"group": "grp2", "id": 0}] * 4 + + [{"group": "grp2", "id": 1}] * 1 +) +group_by_fields = ["group", "id"] +# construct grouping_field by combining two other fields (and ignoring one); mimics what you would do in cards +for ai in GROUPED_INSTANCE_ADDL_INPUTS: + ai.update({"group_id": "_".join([str(ai[ff]) for ff in group_by_fields])}) + class TestMetrics(unittest.TestCase): def test_accuracy(self): @@ -388,6 +478,42 @@ def test_token_overlap(self): for target, value in global_targets.items(): self.assertAlmostEqual(value, outputs[0]["score"]["global"][target]) + def test_grouped_instance_metrics(self): + accuracy_metrics = [ + MeanGroupedAccuracy(), + MeanGroupedStringContainment(), + MeanGroupedAccuracyPDR(), + MeanGroupedStringContainmentPDR(), + ] + global_targets = [0.225, 0.4875, 0.8333333333333334, 0.4444444444444445] + for metric, target in zip(accuracy_metrics, global_targets): + outputs = apply_metric( + metric=metric, + predictions=GROUPED_INSTANCE_PREDICTIONS, + references=GROUPED_INSTANCE_REFERENCES, + additional_inputs=GROUPED_INSTANCE_ADDL_INPUTS, + ) + self.assertAlmostEqual( + target, + outputs[0]["score"]["global"]["score"], + msg=f"{outputs[0]['score']['global']['score_name']} does not equal the expected value", + ) + + f1_metrics = [MeanGroupedTokenOverlap()] + global_targets = [0.5] + for metric, target in zip(f1_metrics, global_targets): + outputs = apply_metric( + metric=metric, + predictions=GROUPED_INSTANCE_PREDICTIONS_SHORT, + references=GROUPED_INSTANCE_REFERENCES_SHORT, + additional_inputs=GROUPED_INSTANCE_ADDL_INPUTS, + ) + self.assertAlmostEqual( + target, + outputs[0]["score"]["global"]["score"], + 
msg=f"{outputs[0]['score']['global']['score_name']} does not equal the expected value", + ) + class TestConfidenceIntervals(unittest.TestCase): def test_confidence_interval_off(self): @@ -478,3 +604,103 @@ def _test_confidence_interval(self, metric, expected_ci_low, expected_ci_high): or score_name not in metric.ci_scores, msg=f"Unexpected confidence interval score '{score_name}'.", ) + + def test_grouped_instance_metric_confidence_interval(self): + import numpy as np + """Test the calculation of confidence intervals for grouped instance metrics (a subclass of global metrics).""" + self._test_grouped_instance_confidence_interval( + metric=MeanGroupedAccuracy(), + expected_ci_low=0.025, + expected_ci_high=0.44047619047619047, + ) + + self._test_grouped_instance_confidence_interval( + metric=MeanGroupedStringContainment(), + expected_ci_low=0.15627449950197503, + expected_ci_high=0.7080527276705951, + ) + + self._test_grouped_instance_confidence_interval( + metric=MeanGroupedAccuracyPDR(), + expected_ci_low=np.nan, + expected_ci_high=np.nan, + ) + + self._test_grouped_instance_confidence_interval( + metric=MeanGroupedStringContainmentPDR(), + expected_ci_low=np.nan, + expected_ci_high=np.nan, + ) + + # F1-based scores + self._test_grouped_instance_confidence_interval( + metric=MeanGroupedTokenOverlap(), + references=GROUPED_INSTANCE_REFERENCES_SHORT, + predictions=GROUPED_INSTANCE_PREDICTIONS_SHORT, + expected_global_result={"group_mean_f1": 0.5, + "score": 0.5, + "score_name": "group_mean_f1", + "group_mean_f1_ci_low": 0.32199800893327996, + "group_mean_f1_ci_high": 0.7899498235031469, + "score_ci_low": 0.32199800893327996, + "score_ci_high": 0.7899498235031469, + "group_mean_precision": 0.5, + "group_mean_precision_ci_low": 0.32199800893327996, + "group_mean_precision_ci_high": 0.7899498235031469, + "group_mean_recall": 0.5, + "group_mean_recall_ci_low": 0.32199800893327996, + "group_mean_recall_ci_high": 0.7899498235031469} + + ) + + def _test_grouped_instance_confidence_interval( + self, metric, expected_ci_low=0.0, expected_ci_high=1.0, + references=GROUPED_INSTANCE_REFERENCES, predictions=GROUPED_INSTANCE_PREDICTIONS, expected_global_result=None + ): + """Test the calculation of confidence intervals for a given metric.""" + import numpy as np + + outputs = apply_metric( + metric=metric, + predictions=predictions, + references=references, + additional_inputs=GROUPED_INSTANCE_ADDL_INPUTS, + ) + + group_score_name = "_".join(["group", metric.reduction_map["group_mean"]["agg_func"][0], metric.main_score]) + + if expected_global_result is None: + expected_global_result = { + f"{group_score_name}_ci_low": expected_ci_low, + f"{group_score_name}_ci_high": expected_ci_high, + "score_ci_low": expected_ci_low, + "score_ci_high": expected_ci_high, + } + + global_result = outputs[0]["score"]["global"].copy() + logger.info(global_result) + for score_name, score_value in global_result.items(): + if score_name in expected_global_result: + # Test that the output value is the same as the expected value + # allow for cases where value is NaN + if not isinstance(score_value, str): + if np.isnan(expected_global_result[score_name]): + assert np.isnan(score_value) + elif np.isnan(score_value): + assert np.isnan(expected_global_result[score_name]) + else: + self.assertAlmostEqual( + score_value, expected_global_result[score_name], places=5, msg=f"score mismatch for {group_score_name}" + ) + else: + self.assertEqual(score_value, expected_global_result[score_name]) + else: + # An output score that is not 
expected + # This is ok if the score_name is not related to confidence intervals + # Otherwise, there was some confidence interval calculation that was not supposed to occur. + self.assertTrue( + "ci_low" not in score_name and "ci_high" not in score_name, + msg=f"Unexpected confidence interval score '{score_name}'.", + ) + + From a797cdc00126b341cac1d16066af194a2adc58c9 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Mon, 8 Jan 2024 11:39:16 +0200 Subject: [PATCH 02/83] modify InstanceMetric to accept grouped_mean reduction --- src/unitxt/metrics.py | 254 +++++++++++++++++++++++++++++++++++------- 1 file changed, 215 insertions(+), 39 deletions(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 96516c4eab..1296911ca7 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -2,8 +2,9 @@ import string import uuid from abc import abstractmethod -from collections import Counter +from collections import Counter, defaultdict from dataclasses import field +from statistics import mean from typing import Any, Dict, Generator, List, Optional, Tuple import evaluate @@ -82,7 +83,48 @@ def _can_compute_confidence_intervals(self, num_predictions): and num_predictions > 1 ) - def score_based_confidence_interval(self, instances): + # def score_based_confidence_interval(self, instances, statistic=None, score_names: Optional[List[str]] = None, func_name=""): + # """Compute confidence intervals based on existing scores, already computed on the input instances. + # + # score_names: List[str] + # Compute a confidence interval for each score_name from this list. + # instances: + # The instances for which the confidence intervals are computed. + # """ + # result = {} + # + # if not self._can_compute_confidence_intervals(num_predictions=len(instances)): + # return result + # + # if score_names is None: + # score_names = ( + # self.ci_scores if self.ci_scores is not None else [self.main_score] + # ) + # if statistic is not None: + # def statistic(instances, field_name): + # return mean([instance[field_name] for instance in instances]) + # + # for score_name in score_names: + # def statistic_wrap(x, field_name=score_name): + # return statistic(instances=x, field_name=field_name) + # + # ci = bootstrap( + # (instances,), + # statistic=statistic_wrap,#lambda x: statistic(instances=x, field_name=score_name), + # n_resamples=self.n_resamples, + # confidence_level=self.confidence_level, + # random_state=self.new_random_generator(), + # ).confidence_interval + # full_score_name = score_name if len(func_name) == 0 else "_".join([str(func_name), score_name]) + # result[f"{full_score_name}_ci_low"] = ci.low + # result[f"{full_score_name}_ci_high"] = ci.high + # if score_name == self.main_score: + # result["score_ci_low"] = ci.low + # result["score_ci_high"] = ci.high + # return result + + + def score_based_confidence_interval(self, instances, aggregation_func=None, score_names: Optional[List[str]] = None, func_name=""): """Compute confidence intervals based on existing scores, already computed on the input instances. score_names: List[str] @@ -90,35 +132,64 @@ def score_based_confidence_interval(self, instances): instances: The instances for which the confidence intervals are computed. 
""" - from statistics import mean - result = {} if not self._can_compute_confidence_intervals(num_predictions=len(instances)): return result + identifiers = list(range(len(instances))) + + + + if score_names is None: + score_names = ( + self.ci_scores if self.ci_scores is not None else [self.main_score] + ) + if aggregation_func is not None: + def aggregation_func(instances, field_name): + return mean([instance[field_name] for instance in instances]) - score_names = ( - self.ci_scores if self.ci_scores is not None else [self.main_score] - ) for score_name in score_names: - scores = [ - instance["score"]["instance"][score_name] for instance in instances - ] + def statistic(arr, axis, score_name=score_name): + # arr is a 2d array where each row is a resampling, so we + # iterate over the rows and compute the metric on each resampling + def metric(instances): + try: + return aggregation_func(instances, score_name) + except Exception as e: + # this happens in edge cases, for example, when the sampling creates a + # sample where all strings are empty and this fails bleu. + logger.info(f"Warning in {self.__class__.__name__}", e) + return np.nan + + scores = numpy.apply_along_axis( + lambda x: metric([instances[ii] for ii in x], + ), + axis=axis, + arr=arr, + ) + return scores + + ci = bootstrap( - (scores,), - statistic=mean, + (identifiers,), + statistic=statistic, n_resamples=self.n_resamples, confidence_level=self.confidence_level, random_state=self.new_random_generator(), ).confidence_interval - result[f"{score_name}_ci_low"] = ci.low - result[f"{score_name}_ci_high"] = ci.high + full_score_name = score_name if len(func_name) == 0 else "_".join([str(func_name), score_name]) + result[f"{full_score_name}_ci_low"] = ci.low + result[f"{full_score_name}_ci_high"] = ci.high if score_name == self.main_score: result["score_ci_low"] = ci.low result["score_ci_high"] = ci.high return result + + + + def compute_global_confidence_intervals( self, references, predictions, additional_inputs, score_name ): @@ -334,8 +405,6 @@ def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generato ), f"Reduction {reduction} is not implemented, use one of {self.implemented_reductions}" if reduction == "mean": - from statistics import mean - for field_name in fields: global_score[field_name] = mean( [ @@ -368,7 +437,11 @@ def compute( class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval): n_resamples = _N_RESAMPLES_DEFAULT_FOR_INSTANCE_METRICS - implemented_reductions: List[str] = field(default_factory=lambda: ["mean"]) + implemented_reductions: List[str] = field(default_factory=lambda: ["mean", "group_mean"]) + + # for grouped metrics: a field that contains the group id. None to disable grouping. + # Grouped metrics aggregate the instance score per group, and then average over group scores. + grouping_field: str = None @property @abstractmethod @@ -376,6 +449,59 @@ def reduction_map(self) -> dict: pass def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator: + + instances, global_score = self.compute_instance_scores(stream, stream_name) + + for reduction, fields in self.reduction_map.items(): + assert ( + reduction in self.implemented_reductions + ), f"Reduction {reduction} is not implemented, use one of {self.implemented_reductions}" + + aggregation_func = None + if reduction == "mean": + aggregation_func = self.aggregate + + if reduction == "group_mean": + if not self.grouping_field: + raise ValueError("self.grouping_field is None, . 
" + "This field is required for group based metric computation.") + # # for group_mean, expects a dict + assert isinstance(fields, dict) + assert "agg_func" in fields, "fields should have a key 'agg_func' consisting of a 2-element list of a function name and function definition" + assert callable(fields["agg_func"][1]), "second item in fields['agg_func'] should be a callable function" + score_fields = [self.main_score] if "score_fields" not in fields else fields["score_fields"] + def aggregation_func(instances, field_name, field=fields["agg_func"][1]): + return self.grouped_aggregate(instances, field_name, field) + + if not aggregation_func: + raise ValueError(f"No aggregation_func was defined for reduction {reduction}. " + f"Please specify a valid reduction method in reduction_map {self.reduction_map}.") + + for field_name in (score_fields if reduction == "group_mean" else fields): + if reduction == "group_mean": + field_name_full_prefix = "group_" + str(fields["agg_func"][0]) + field_name_full = "_".join([field_name_full_prefix, field_name]) + else: + field_name_full_prefix = "" + field_name_full = field_name + global_score[field_name_full] = aggregation_func(instances, field_name) + if field_name == self.main_score: + global_score["score"] = global_score[field_name_full] + global_score["score_name"] = field_name_full + # lambda instances, score_name: [fv for fv in [aggregation_func(instances=instances, field_name=score_name)] if not np.isnan(fv)] + def bootstrap_aggregation_func(instances, field_name, agg_func=aggregation_func): + return agg_func(instances=instances, field_name=field_name) + + confidence_interval = self.score_based_confidence_interval( + instances=instances, aggregation_func=bootstrap_aggregation_func, + score_names=[field_name], func_name=field_name_full_prefix + ) + global_score.update(confidence_interval) + + for instance in instances: + yield from instance + + def compute_instance_scores(self, stream: Stream, stream_name: Optional[str] = None): global_score = {} instances = [] @@ -399,31 +525,37 @@ def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generato instances.append(instance) - for reduction, fields in self.reduction_map.items(): - assert ( - reduction in self.implemented_reductions - ), f"Reduction {reduction} is not implemented, use one of {self.implemented_reductions}" + for instance in instances: + yield instance - if reduction == "mean": - from statistics import mean + return instances, global_score - for field_name in fields: - scores = [ - instance["score"]["instance"][field_name] - for instance in instances - ] - global_score[field_name] = mean(scores) - if field_name == self.main_score: - global_score["score"] = global_score[field_name] - global_score["score_name"] = self.main_score - - confidence_interval = self.score_based_confidence_interval( - instances=instances - ) - global_score.update(confidence_interval) + @staticmethod + def aggregate(instances, field_name): + scores = [ + instance["score"]["instance"][field_name] + for instance in instances + ] + return np.nanmean(scores) + def grouped_aggregate(self, instances, field_name, aggregation_func): + group_to_instance_scores = defaultdict(list) for instance in instances: - yield instance + additional_inputs = instance["additional_inputs"] + if self.grouping_field not in additional_inputs: + raise ValueError(f"Missing '{self.grouping_field}' from instance {instance}. 
" + f"This field is required for group based metric computation.") + group_key = additional_inputs[self.grouping_field] # do we need to convert to str? + group_to_instance_scores[group_key].append( + instance["score"]["instance"][field_name] + ) + + group_total_scores = [ + aggregation_func(scores) for scores in group_to_instance_scores.values() + ] + group_total_scores = [score for score in group_total_scores if not np.isnan(score)] + # ignore NaNs in aggregation + return mean(group_total_scores) if len(group_total_scores) else np.nan @abstractmethod def compute( @@ -489,7 +621,7 @@ def compute( ) -> dict: result = { self.main_score: float( - any(str(reference) in prediction for reference in references) + any(str(reference) in str(prediction) for reference in references) ) } result["score"] = result[self.main_score] @@ -1436,3 +1568,47 @@ def _compute( for k in self.k_list: result[self.score_name(measure_name, k)] = measure_array[min(k, max_k)] return result + + +# define metrics that return means of an aggregation function applied across levels of a grouping variable +def performance_drop_rate(instance_scores: List): + """Percentage change of mean performance on test elements relative to that on a baseline. + + from https://arxiv.org/pdf/2306.04528.pdf. + + Args: + instance_scores: a list of scores on instances. Assume the first element is the original, the others are test set + + Returns: + numeric PDR metric. + If only one element (no test set) or the first is 0 (percentage change is undefined) return NaN + otherwise, calculate PDR + + """ + assert isinstance(instance_scores, list) + return ( + np.nan + if (len(instance_scores) < 2 or instance_scores[0] == 0) + else 1 - mean(instance_scores[1:]) / instance_scores[0] + ) + + +class MeanGroupedAccuracy(Accuracy): + grouping_field = "group_id" + reduction_map = {"group_mean": {"agg_func": ["mean", mean]}} + +class MeanGroupedAccuracyPDR(Accuracy): + grouping_field = "group_id" + reduction_map = {"group_mean": {"agg_func": ["pdr", performance_drop_rate]}} + +class MeanGroupedStringContainment(StringContainment): + grouping_field = "group_id" + reduction_map = {"group_mean": {"agg_func": ["mean", np.nanmean]}} + +class MeanGroupedStringContainmentPDR(StringContainment): + grouping_field = "group_id" + reduction_map = {"group_mean": {"agg_func": ["pdr", performance_drop_rate]}} + +class MeanGroupedTokenOverlap(TokenOverlap): + grouping_field = "group_id" + reduction_map = {"group_mean": {"agg_func": ["mean", np.nanmean], "score_fields": ["f1", "precision", "recall"]}} From 0d63164138c6d2f8cb9623c099aa3552788c3858 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Mon, 8 Jan 2024 11:40:14 +0200 Subject: [PATCH 03/83] initial commit --- prepare/metrics/grouped_instance_metrics.py | 354 ++++++++++++++++++++ 1 file changed, 354 insertions(+) create mode 100644 prepare/metrics/grouped_instance_metrics.py diff --git a/prepare/metrics/grouped_instance_metrics.py b/prepare/metrics/grouped_instance_metrics.py new file mode 100644 index 0000000000..a40a591fb8 --- /dev/null +++ b/prepare/metrics/grouped_instance_metrics.py @@ -0,0 +1,354 @@ +import numpy as np + +from src.unitxt import add_to_catalog +from src.unitxt.metrics import ( + MeanGroupedAccuracy, + MeanGroupedAccuracyPDR, + MeanGroupedStringContainment, + MeanGroupedStringContainmentPDR, + MeanGroupedTokenOverlap, +) +from src.unitxt.test_utils.metrics import test_metric + +predictions = [ + "A B", + "BC D", + "C", + "123", + "BCD", + 10, + " BD", + "AB", + "I am a dog", + "AB C", + "AB 1", 
+ "GMA", + 0.123, + "BD", + "abc", +] + +references = [ + ["B", "AB", "A"], + ["A", "BC D", "BC DF"], + ["c", " C"], + [13, 23, 234], + [" ", " BD", " BDA"], + [1, 10, 100], + ["A", "B", "BD"], + ["ABC", "ab", "BC"], + ["I am a person", "I AM A DOG", "ABC"], + ["AB CD", "AB", "ab"], + ["AB 1", "AB1"], + [" GMA 123", "GMA"], + ["123", 0.12], + ["BDE", "BCE", "bdefs"], + [" abcdefg", "AB", "abcd"], +] + +# possibly multi-column group identifier +additional_inputs = ( + [{"group": "grp1", "id": 0, "ignore": 1}] * 5 + + [{"group": "grp1", "id": 1, "ignore": 1}] * 5 + + [{"group": "grp2", "id": 0, "ignore": 1}] * 4 + + [{"group": "grp2", "id": 1, "ignore": 0}] * 1 +) + +group_by_fields = ["group", "id"] +# construct grouping_field by combining two other fields (and ignoring one); mimics what you would do in cards +for ai in additional_inputs: + ai.update({"group_id": "_".join([str(ai[ff]) for ff in group_by_fields])}) + + +instance_targets_string_containment = [ + {"score": 1.0 + }, + { + "score": 1.0 + }, + { + "score": 0.0, + }, + { + "score": 1.0, + }, + { + "score": 0.0, + }, + { + "score": 1.0, + }, + { + "score": 1.0, + }, + { + "score": 0.0, + }, + { + "score": 0.0, + }, + { + "score": 1.0, + }, + { + "score": 1.0, + }, + { + "score": 1.0, + }, + { + "score": 1.0, + }, + { + "score": 0.0, + }, + { + "score": 0.0, + }, +] + +for instance in instance_targets_string_containment: + instance.update({"string_containment": instance["score"], "score_name": "string_containment"}) + +instance_targets_accuracy = [ + {"score": 0.0}, + {"score": 1.0}, + {"score": 0.0}, + {"score": 0.0}, + {"score": 0.0}, + {"score": 1.0}, + {"score": 0.0}, + {"score": 0.0}, + {"score": 0.0}, + {"score": 0.0}, + {"score": 1.0}, + {"score": 1.0}, + {"score": 0.0}, + {"score": 0.0}, + {"score": 0.0}, +] + +for instance in instance_targets_accuracy: + instance.update({"accuracy": instance["score"], "score_name": "accuracy"}) + +metric = MeanGroupedAccuracy() +global_target = { + "group_mean_accuracy": 0.23, + "score": 0.23, + "score_name": "group_mean_accuracy", + "score_ci_low": 0.02, + "score_ci_high": 0.44, + "group_mean_accuracy_ci_low": 0.02, + "group_mean_accuracy_ci_high": 0.44, +} + + +outputs = test_metric( + metric=metric, + predictions=predictions, + references=references, + instance_targets=instance_targets_accuracy, + global_target=global_target, + additional_inputs=additional_inputs, +) + +add_to_catalog(metric, "metrics.group_mean_accuracy", overwrite=True) + + +metric = MeanGroupedStringContainment() +global_target = { + "group_mean_string_containment": 0.49, + "score": 0.49, + "score_name": "group_mean_string_containment", + "score_ci_low": 0.16, + "score_ci_high": 0.71, + "group_mean_string_containment_ci_low": 0.16, + "group_mean_string_containment_ci_high": 0.71, +} + + +outputs = test_metric( + metric=metric, + predictions=predictions, + references=references, + instance_targets=instance_targets_string_containment, + global_target=global_target, + additional_inputs=additional_inputs, +) + +add_to_catalog(metric, "metrics.group_mean_string_containment", overwrite=True) + + +# PDR +metric = MeanGroupedAccuracyPDR() +global_target = { + "group_pdr_accuracy": 0.83, + "score": 0.83, + "score_name": "group_pdr_accuracy", + "score_ci_low": np.nan, + "score_ci_high": np.nan, + "group_pdr_accuracy_ci_low": np.nan, + "group_pdr_accuracy_ci_high": np.nan, +} + + +outputs = test_metric( + metric=metric, + predictions=predictions, + references=references, + instance_targets=instance_targets_accuracy, + 
global_target=global_target, + additional_inputs=additional_inputs, +) + +add_to_catalog(metric, "metrics.group_pdr_accuracy", overwrite=True) + + +metric = MeanGroupedStringContainmentPDR() +global_target = { + "group_pdr_string_containment": 0.44, + "score": 0.44, + "score_name": "group_pdr_string_containment", + "score_ci_low": np.nan, + "score_ci_high": np.nan, + "group_pdr_string_containment_ci_low": np.nan, + "group_pdr_string_containment_ci_high": np.nan, +} + + +outputs = test_metric( + metric=metric, + predictions=predictions, + references=references, + instance_targets=instance_targets_string_containment, + global_target=global_target, + additional_inputs=additional_inputs, +) + +add_to_catalog(metric, "metrics.group_pdr_string_containment", overwrite=True) + + +# create references and predictions with only 3 unique values +short_predictions = [ + "A", + "B", + "B", + "A", + "B", + "B", + "A", + "A", + "B", + "B", + "A", + "B", + "A", + "A", + "B", +] + +short_references = [ + ["A", "B"], + ["A", "C"], + ["B", "C", "A"], + ["A"], + ["B", "A"], + ["C", "B"], + ["A"], + ["B", "C"], + ["A", "B", "C"], + ["A", "B"], + ["B", "C"], + ["C"], + ["C", "B"], + ["B", "A"], + ["B"], +] +# f1_references = [[rr] for rr in f1_references] +instance_targets_f1 = [ + {"group_mean_f1_macro": 0.5, "score": 0.5, "score_name": "group_mean_f1_macro"}, + {"group_mean_f1_macro": 0.0, "score": 0.0, "score_name": "group_mean_f1_macro"}, + {"group_mean_f1_macro": 0.33, "score": 0.33, "score_name": "group_mean_f1_macro"}, + {"group_mean_f1_macro": 1.0, "score": 1.0, "score_name": "group_mean_f1_macro"}, + {"group_mean_f1_macro": 0.5, "score": 0.5, "score_name": "group_mean_f1_macro"}, + {"group_mean_f1_macro": 0.5, "score": 0.5, "score_name": "group_mean_f1_macro"}, + {"group_mean_f1_macro": 1.0, "score": 1.0, "score_name": "group_mean_f1_macro"}, + {"group_mean_f1_macro": 0.0, "score": 0.0, "score_name": "group_mean_f1_macro"}, + {"group_mean_f1_macro": 0.33, "score": 0.33, "score_name": "group_mean_f1_macro"}, + {"group_mean_f1_macro": 0.5, "score": 0.5, "score_name": "group_mean_f1_macro"}, + {"group_mean_f1_macro": 0.0, "score": 0.0, "score_name": "group_mean_f1_macro"}, + {"group_mean_f1_macro": 0.0, "score": 0.0, "score_name": "group_mean_f1_macro"}, + {"group_mean_f1_macro": 0.0, "score": 0.0, "score_name": "group_mean_f1_macro"}, + {"group_mean_f1_macro": 0.5, "score": 0.5, "score_name": "group_mean_f1_macro"}, + {"group_mean_f1_macro": 1.0, "score": 1.0, "score_name": "group_mean_f1_macro"}, +] + + +global_target = {"group_mean_f1": 0.5, + "score": 0.5, + "score_name": "group_mean_f1", + "group_mean_f1_ci_low": 0.32, + "group_mean_f1_ci_high": 0.79, + "score_ci_low": 0.32, + "score_ci_high": 0.79, + "group_mean_precision": 0.5, + "group_mean_precision_ci_low": 0.32, + "group_mean_precision_ci_high": 0.79, + "group_mean_recall": 0.5, + "group_mean_recall_ci_low": 0.32, + "group_mean_recall_ci_high": 0.79} + +instance_targets_token_overlap = [{"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, + {"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, + {"precision": 1.0, + "recall": 1.0, + "f1": 1.0, + "score": 1.0, + "score_name": "f1"}, + {"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, + {"precision": 1.0, + "recall": 1.0, + "f1": 1.0, + "score": 1.0, + "score_name": "f1"}, + {"precision": 1.0, + "recall": 1.0, + "f1": 1.0, + "score": 1.0, + "score_name": "f1"}, + {"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, + 
{"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, + {"precision": 1.0, + "recall": 1.0, + "f1": 1.0, + "score": 1.0, + "score_name": "f1"}, + {"precision": 1.0, + "recall": 1.0, + "f1": 1.0, + "score": 1.0, + "score_name": "f1"}, + {"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, + {"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, + {"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, + {"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, + {"precision": 1.0, + "recall": 1.0, + "f1": 1.0, + "score": 1.0, + "score_name": "f1"}] + + + +metric = MeanGroupedTokenOverlap() + +outputs = test_metric( + metric=metric, + predictions=short_predictions, + references=short_references, + instance_targets=instance_targets_token_overlap, + global_target=global_target, + additional_inputs=additional_inputs, +) + +add_to_catalog(metric, "metrics.group_mean_f1_macro_multilabel", overwrite=True) From 0f3f8289c82e7a7e78356acbe49e91a5dc16bf28 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Mon, 8 Jan 2024 13:32:30 +0200 Subject: [PATCH 04/83] apply ruff formatting --- prepare/metrics/grouped_instance_metrics.py | 114 ++++++++------------ tests/test_metrics.py | 64 ++++++----- 2 files changed, 83 insertions(+), 95 deletions(-) diff --git a/prepare/metrics/grouped_instance_metrics.py b/prepare/metrics/grouped_instance_metrics.py index a40a591fb8..0e29ca8ca9 100644 --- a/prepare/metrics/grouped_instance_metrics.py +++ b/prepare/metrics/grouped_instance_metrics.py @@ -1,5 +1,3 @@ -import numpy as np - from src.unitxt import add_to_catalog from src.unitxt.metrics import ( MeanGroupedAccuracy, @@ -61,11 +59,8 @@ instance_targets_string_containment = [ - {"score": 1.0 - }, - { - "score": 1.0 - }, + {"score": 1.0}, + {"score": 1.0}, { "score": 0.0, }, @@ -108,7 +103,9 @@ ] for instance in instance_targets_string_containment: - instance.update({"string_containment": instance["score"], "score_name": "string_containment"}) + instance.update( + {"string_containment": instance["score"], "score_name": "string_containment"} + ) instance_targets_accuracy = [ {"score": 0.0}, @@ -185,10 +182,10 @@ "group_pdr_accuracy": 0.83, "score": 0.83, "score_name": "group_pdr_accuracy", - "score_ci_low": np.nan, - "score_ci_high": np.nan, - "group_pdr_accuracy_ci_low": np.nan, - "group_pdr_accuracy_ci_high": np.nan, + "score_ci_low": 0.38, + "score_ci_high": 1.0, + "group_pdr_accuracy_ci_low": 0.38, + "group_pdr_accuracy_ci_high": 1.0, } @@ -209,10 +206,10 @@ "group_pdr_string_containment": 0.44, "score": 0.44, "score_name": "group_pdr_string_containment", - "score_ci_low": np.nan, - "score_ci_high": np.nan, - "group_pdr_string_containment_ci_low": np.nan, - "group_pdr_string_containment_ci_high": np.nan, + "score_ci_low": 0.14, + "score_ci_high": 1.0, + "group_pdr_string_containment_ci_low": 0.14, + "group_pdr_string_containment_ci_high": 1.0, } @@ -284,60 +281,39 @@ ] -global_target = {"group_mean_f1": 0.5, - "score": 0.5, - "score_name": "group_mean_f1", - "group_mean_f1_ci_low": 0.32, - "group_mean_f1_ci_high": 0.79, - "score_ci_low": 0.32, - "score_ci_high": 0.79, - "group_mean_precision": 0.5, - "group_mean_precision_ci_low": 0.32, - "group_mean_precision_ci_high": 0.79, - "group_mean_recall": 0.5, - "group_mean_recall_ci_low": 0.32, - "group_mean_recall_ci_high": 0.79} - -instance_targets_token_overlap = [{"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, - {"precision": 0, "recall": 0, "f1": 0, "score": 
0, "score_name": "f1"}, - {"precision": 1.0, - "recall": 1.0, - "f1": 1.0, - "score": 1.0, - "score_name": "f1"}, - {"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, - {"precision": 1.0, - "recall": 1.0, - "f1": 1.0, - "score": 1.0, - "score_name": "f1"}, - {"precision": 1.0, - "recall": 1.0, - "f1": 1.0, - "score": 1.0, - "score_name": "f1"}, - {"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, - {"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, - {"precision": 1.0, - "recall": 1.0, - "f1": 1.0, - "score": 1.0, - "score_name": "f1"}, - {"precision": 1.0, - "recall": 1.0, - "f1": 1.0, - "score": 1.0, - "score_name": "f1"}, - {"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, - {"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, - {"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, - {"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, - {"precision": 1.0, - "recall": 1.0, - "f1": 1.0, - "score": 1.0, - "score_name": "f1"}] +global_target = { + "group_mean_f1": 0.5, + "score": 0.5, + "score_name": "group_mean_f1", + "group_mean_f1_ci_low": 0.32, + "group_mean_f1_ci_high": 0.79, + "score_ci_low": 0.32, + "score_ci_high": 0.79, + "group_mean_precision": 0.5, + "group_mean_precision_ci_low": 0.32, + "group_mean_precision_ci_high": 0.79, + "group_mean_recall": 0.5, + "group_mean_recall_ci_low": 0.32, + "group_mean_recall_ci_high": 0.79, +} +instance_targets_token_overlap = [ + {"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, + {"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, + {"precision": 1.0, "recall": 1.0, "f1": 1.0, "score": 1.0, "score_name": "f1"}, + {"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, + {"precision": 1.0, "recall": 1.0, "f1": 1.0, "score": 1.0, "score_name": "f1"}, + {"precision": 1.0, "recall": 1.0, "f1": 1.0, "score": 1.0, "score_name": "f1"}, + {"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, + {"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, + {"precision": 1.0, "recall": 1.0, "f1": 1.0, "score": 1.0, "score_name": "f1"}, + {"precision": 1.0, "recall": 1.0, "f1": 1.0, "score": 1.0, "score_name": "f1"}, + {"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, + {"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, + {"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, + {"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, + {"precision": 1.0, "recall": 1.0, "f1": 1.0, "score": 1.0, "score_name": "f1"}, +] metric = MeanGroupedTokenOverlap() diff --git a/tests/test_metrics.py b/tests/test_metrics.py index eb1eaca1d0..230b8ec286 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -606,7 +606,6 @@ def _test_confidence_interval(self, metric, expected_ci_low, expected_ci_high): ) def test_grouped_instance_metric_confidence_interval(self): - import numpy as np """Test the calculation of confidence intervals for grouped instance metrics (a subclass of global metrics).""" self._test_grouped_instance_confidence_interval( metric=MeanGroupedAccuracy(), @@ -622,14 +621,14 @@ def test_grouped_instance_metric_confidence_interval(self): self._test_grouped_instance_confidence_interval( metric=MeanGroupedAccuracyPDR(), - expected_ci_low=np.nan, - expected_ci_high=np.nan, + expected_ci_low=0.375, + expected_ci_high=1.0, ) 
self._test_grouped_instance_confidence_interval( metric=MeanGroupedStringContainmentPDR(), - expected_ci_low=np.nan, - expected_ci_high=np.nan, + expected_ci_low=0.14285714285714288, + expected_ci_high=1.0, ) # F1-based scores @@ -637,27 +636,33 @@ def test_grouped_instance_metric_confidence_interval(self): metric=MeanGroupedTokenOverlap(), references=GROUPED_INSTANCE_REFERENCES_SHORT, predictions=GROUPED_INSTANCE_PREDICTIONS_SHORT, - expected_global_result={"group_mean_f1": 0.5, - "score": 0.5, - "score_name": "group_mean_f1", - "group_mean_f1_ci_low": 0.32199800893327996, - "group_mean_f1_ci_high": 0.7899498235031469, - "score_ci_low": 0.32199800893327996, - "score_ci_high": 0.7899498235031469, - "group_mean_precision": 0.5, - "group_mean_precision_ci_low": 0.32199800893327996, - "group_mean_precision_ci_high": 0.7899498235031469, - "group_mean_recall": 0.5, - "group_mean_recall_ci_low": 0.32199800893327996, - "group_mean_recall_ci_high": 0.7899498235031469} - + expected_global_result={ + "group_mean_f1": 0.5, + "score": 0.5, + "score_name": "group_mean_f1", + "group_mean_f1_ci_low": 0.32199800893327996, + "group_mean_f1_ci_high": 0.7899498235031469, + "score_ci_low": 0.32199800893327996, + "score_ci_high": 0.7899498235031469, + "group_mean_precision": 0.5, + "group_mean_precision_ci_low": 0.32199800893327996, + "group_mean_precision_ci_high": 0.7899498235031469, + "group_mean_recall": 0.5, + "group_mean_recall_ci_low": 0.32199800893327996, + "group_mean_recall_ci_high": 0.7899498235031469, + }, ) def _test_grouped_instance_confidence_interval( - self, metric, expected_ci_low=0.0, expected_ci_high=1.0, - references=GROUPED_INSTANCE_REFERENCES, predictions=GROUPED_INSTANCE_PREDICTIONS, expected_global_result=None + self, + metric, + expected_ci_low=0.0, + expected_ci_high=1.0, + references=GROUPED_INSTANCE_REFERENCES, + predictions=GROUPED_INSTANCE_PREDICTIONS, + expected_global_result=None, ): - """Test the calculation of confidence intervals for a given metric.""" + """Test the calculation of confidence intervals for a given metric with group_mean reduction.""" import numpy as np outputs = apply_metric( @@ -667,7 +672,13 @@ def _test_grouped_instance_confidence_interval( additional_inputs=GROUPED_INSTANCE_ADDL_INPUTS, ) - group_score_name = "_".join(["group", metric.reduction_map["group_mean"]["agg_func"][0], metric.main_score]) + group_score_name = "_".join( + [ + "group", + metric.reduction_map["group_mean"]["agg_func"][0], + metric.main_score, + ] + ) if expected_global_result is None: expected_global_result = { @@ -690,7 +701,10 @@ def _test_grouped_instance_confidence_interval( assert np.isnan(expected_global_result[score_name]) else: self.assertAlmostEqual( - score_value, expected_global_result[score_name], places=5, msg=f"score mismatch for {group_score_name}" + score_value, + expected_global_result[score_name], + places=5, + msg=f"score mismatch for {group_score_name}", ) else: self.assertEqual(score_value, expected_global_result[score_name]) @@ -702,5 +716,3 @@ def _test_grouped_instance_confidence_interval( "ci_low" not in score_name and "ci_high" not in score_name, msg=f"Unexpected confidence interval score '{score_name}'.", ) - - From a31697237618d644f45ba5e9f14c2411366c3e05 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Mon, 8 Jan 2024 13:32:39 +0200 Subject: [PATCH 05/83] apply ruff formatting, reduce complexity --- src/unitxt/metrics.py | 159 +++++++++++++++++++++++++++++------------- 1 file changed, 112 insertions(+), 47 deletions(-) diff --git 
a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 1296911ca7..bc41e7b031 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -123,33 +123,45 @@ def _can_compute_confidence_intervals(self, num_predictions): # result["score_ci_high"] = ci.high # return result - - def score_based_confidence_interval(self, instances, aggregation_func=None, score_names: Optional[List[str]] = None, func_name=""): + def score_based_confidence_interval( + self, + instances, + aggregation_func=None, + score_names: Optional[List[str]] = None, + func_name="", + ): """Compute confidence intervals based on existing scores, already computed on the input instances. + Unlike GlobalMetric, this is simply a function of the instance scores (possibly taking into account additional_inputs field), + so they don't need to be recomputed after every bootstrap draw. + score_names: List[str] Compute a confidence interval for each score_name from this list. instances: The instances for which the confidence intervals are computed. + aggregation_func: + A function with arguments instances, field_name; is applied on list of instances (which may include additional_inputs + field, as well as the prediction and references), and the field_name; default is simply to take the mean field_name from + instances after resampling, if aggregation_func=None. """ result = {} if not self._can_compute_confidence_intervals(num_predictions=len(instances)): return result + # resample the indices of the instances, which contain the scores identifiers = list(range(len(instances))) - - if score_names is None: score_names = ( self.ci_scores if self.ci_scores is not None else [self.main_score] ) - if aggregation_func is not None: - def aggregation_func(instances, field_name): - return mean([instance[field_name] for instance in instances]) + if aggregation_func is None: + def aggregation_func(instances, field_name): + return mean([instance[field_name] for instance in instances]) for score_name in score_names: + # need to redefine the statistic function within the loop because score_name is a loop variable, to avoid ruff errors def statistic(arr, axis, score_name=score_name): # arr is a 2d array where each row is a resampling, so we # iterate over the rows and compute the metric on each resampling @@ -163,14 +175,16 @@ def metric(instances): return np.nan scores = numpy.apply_along_axis( - lambda x: metric([instances[ii] for ii in x], - ), + lambda x: metric( + [instances[ii] for ii in x], + ), axis=axis, arr=arr, ) - return scores + return self.resample_from_non_nan(scores) + # apply bootstrap only on the relevant field ci = bootstrap( (identifiers,), statistic=statistic, @@ -178,7 +192,11 @@ def metric(instances): confidence_level=self.confidence_level, random_state=self.new_random_generator(), ).confidence_interval - full_score_name = score_name if len(func_name) == 0 else "_".join([str(func_name), score_name]) + full_score_name = ( + score_name + if len(func_name) == 0 + else "_".join([str(func_name), score_name]) + ) result[f"{full_score_name}_ci_low"] = ci.low result[f"{full_score_name}_ci_high"] = ci.high if score_name == self.main_score: @@ -186,9 +204,16 @@ def metric(instances): result["score_ci_high"] = ci.high return result - - - + def resample_from_non_nan(self, values): + if values.size > 1: + error_indices = numpy.isnan(values) + n_errors = sum(error_indices) + if 0 < n_errors < values.size: + # replace NaN aggregate scores with random draws from non-NaN scores, so that confidence interval isn't NaN itself + values[error_indices] 
= self.new_random_generator().choice( + values[~error_indices], n_errors, replace=True + ) + return values def compute_global_confidence_intervals( self, references, predictions, additional_inputs, score_name @@ -437,7 +462,9 @@ def compute( class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval): n_resamples = _N_RESAMPLES_DEFAULT_FOR_INSTANCE_METRICS - implemented_reductions: List[str] = field(default_factory=lambda: ["mean", "group_mean"]) + implemented_reductions: List[str] = field( + default_factory=lambda: ["mean", "group_mean"] + ) # for grouped metrics: a field that contains the group id. None to disable grouping. # Grouped metrics aggregate the instance score per group, and then average over group scores. @@ -448,8 +475,34 @@ class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval): def reduction_map(self) -> dict: pass - def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator: + def _validate_group_mean_reduction(self): + if "group_mean" in self.reduction_map: + # for group_mean, expects a dict + fields = self.reduction_map["group_mean"] + if not self.grouping_field: + raise ValueError( + "self.grouping_field is None, . " + "This field is required for group based metric computation." + ) + assert isinstance(fields, dict) + assert ( + "agg_func" in fields + ), "fields should have a key 'agg_func' consisting of a 2-element list of a function name and function definition" + assert callable( + fields["agg_func"][1] + ), "second item in fields['agg_func'] should be a callable function" + if "score_fields" in fields: + assert isinstance(fields["score_fields"], list) + + return ( + [self.main_score] + if "score_fields" not in fields + else fields["score_fields"] + ) + return [self.main_score] + + def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator: instances, global_score = self.compute_instance_scores(stream, stream_name) for reduction, fields in self.reduction_map.items(): @@ -462,22 +515,20 @@ def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generato aggregation_func = self.aggregate if reduction == "group_mean": - if not self.grouping_field: - raise ValueError("self.grouping_field is None, . " - "This field is required for group based metric computation.") - # # for group_mean, expects a dict - assert isinstance(fields, dict) - assert "agg_func" in fields, "fields should have a key 'agg_func' consisting of a 2-element list of a function name and function definition" - assert callable(fields["agg_func"][1]), "second item in fields['agg_func'] should be a callable function" - score_fields = [self.main_score] if "score_fields" not in fields else fields["score_fields"] - def aggregation_func(instances, field_name, field=fields["agg_func"][1]): + score_fields = self._validate_group_mean_reduction() + + def aggregation_func( + instances, field_name, field=fields["agg_func"][1] + ): return self.grouped_aggregate(instances, field_name, field) if not aggregation_func: - raise ValueError(f"No aggregation_func was defined for reduction {reduction}. " - f"Please specify a valid reduction method in reduction_map {self.reduction_map}.") + raise ValueError( + f"No aggregation_func was defined for reduction {reduction}. " + f"Please specify a valid reduction method in reduction_map {self.reduction_map}." 
+ ) - for field_name in (score_fields if reduction == "group_mean" else fields): + for field_name in score_fields if reduction == "group_mean" else fields: if reduction == "group_mean": field_name_full_prefix = "group_" + str(fields["agg_func"][0]) field_name_full = "_".join([field_name_full_prefix, field_name]) @@ -488,20 +539,25 @@ def aggregation_func(instances, field_name, field=fields["agg_func"][1]): if field_name == self.main_score: global_score["score"] = global_score[field_name_full] global_score["score_name"] = field_name_full - # lambda instances, score_name: [fv for fv in [aggregation_func(instances=instances, field_name=score_name)] if not np.isnan(fv)] - def bootstrap_aggregation_func(instances, field_name, agg_func=aggregation_func): + + def bootstrap_aggregation_func( + instances, field_name, agg_func=aggregation_func + ): return agg_func(instances=instances, field_name=field_name) confidence_interval = self.score_based_confidence_interval( - instances=instances, aggregation_func=bootstrap_aggregation_func, - score_names=[field_name], func_name=field_name_full_prefix + instances=instances, + aggregation_func=bootstrap_aggregation_func, + score_names=[field_name], + func_name=field_name_full_prefix, ) global_score.update(confidence_interval) - for instance in instances: - yield from instance + yield from instances - def compute_instance_scores(self, stream: Stream, stream_name: Optional[str] = None): + def compute_instance_scores( + self, stream: Stream, stream_name: Optional[str] = None + ): global_score = {} instances = [] @@ -525,17 +581,11 @@ def compute_instance_scores(self, stream: Stream, stream_name: Optional[str] = N instances.append(instance) - for instance in instances: - yield instance - return instances, global_score @staticmethod def aggregate(instances, field_name): - scores = [ - instance["score"]["instance"][field_name] - for instance in instances - ] + scores = [instance["score"]["instance"][field_name] for instance in instances] return np.nanmean(scores) def grouped_aggregate(self, instances, field_name, aggregation_func): @@ -543,9 +593,13 @@ def grouped_aggregate(self, instances, field_name, aggregation_func): for instance in instances: additional_inputs = instance["additional_inputs"] if self.grouping_field not in additional_inputs: - raise ValueError(f"Missing '{self.grouping_field}' from instance {instance}. " - f"This field is required for group based metric computation.") - group_key = additional_inputs[self.grouping_field] # do we need to convert to str? + raise ValueError( + f"Missing '{self.grouping_field}' from instance {instance}. " + f"This field is required for group based metric computation." + ) + group_key = additional_inputs[ + self.grouping_field + ] # do we need to convert to str? 
group_to_instance_scores[group_key].append( instance["score"]["instance"][field_name] ) @@ -553,7 +607,9 @@ def grouped_aggregate(self, instances, field_name, aggregation_func): group_total_scores = [ aggregation_func(scores) for scores in group_to_instance_scores.values() ] - group_total_scores = [score for score in group_total_scores if not np.isnan(score)] + group_total_scores = [ + score for score in group_total_scores if not np.isnan(score) + ] # ignore NaNs in aggregation return mean(group_total_scores) if len(group_total_scores) else np.nan @@ -1597,18 +1653,27 @@ class MeanGroupedAccuracy(Accuracy): grouping_field = "group_id" reduction_map = {"group_mean": {"agg_func": ["mean", mean]}} + class MeanGroupedAccuracyPDR(Accuracy): grouping_field = "group_id" reduction_map = {"group_mean": {"agg_func": ["pdr", performance_drop_rate]}} + class MeanGroupedStringContainment(StringContainment): grouping_field = "group_id" reduction_map = {"group_mean": {"agg_func": ["mean", np.nanmean]}} + class MeanGroupedStringContainmentPDR(StringContainment): grouping_field = "group_id" reduction_map = {"group_mean": {"agg_func": ["pdr", performance_drop_rate]}} + class MeanGroupedTokenOverlap(TokenOverlap): grouping_field = "group_id" - reduction_map = {"group_mean": {"agg_func": ["mean", np.nanmean], "score_fields": ["f1", "precision", "recall"]}} + reduction_map = { + "group_mean": { + "agg_func": ["mean", np.nanmean], + "score_fields": ["f1", "precision", "recall"], + } + } From ffa4e1dee232508a6b323105dc2783c8c750d60c Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Mon, 8 Jan 2024 14:15:13 +0200 Subject: [PATCH 06/83] merge with main --- src/unitxt/metrics.py | 198 +++++++++++++++++++++++++++--------------- 1 file changed, 127 insertions(+), 71 deletions(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index bc41e7b031..1b31c5c9f4 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -2,7 +2,7 @@ import string import uuid from abc import abstractmethod -from collections import Counter, defaultdict +from collections import Counter from dataclasses import field from statistics import mean from typing import Any, Dict, Generator, List, Optional, Tuple @@ -83,46 +83,6 @@ def _can_compute_confidence_intervals(self, num_predictions): and num_predictions > 1 ) - # def score_based_confidence_interval(self, instances, statistic=None, score_names: Optional[List[str]] = None, func_name=""): - # """Compute confidence intervals based on existing scores, already computed on the input instances. - # - # score_names: List[str] - # Compute a confidence interval for each score_name from this list. - # instances: - # The instances for which the confidence intervals are computed. 
- # """ - # result = {} - # - # if not self._can_compute_confidence_intervals(num_predictions=len(instances)): - # return result - # - # if score_names is None: - # score_names = ( - # self.ci_scores if self.ci_scores is not None else [self.main_score] - # ) - # if statistic is not None: - # def statistic(instances, field_name): - # return mean([instance[field_name] for instance in instances]) - # - # for score_name in score_names: - # def statistic_wrap(x, field_name=score_name): - # return statistic(instances=x, field_name=field_name) - # - # ci = bootstrap( - # (instances,), - # statistic=statistic_wrap,#lambda x: statistic(instances=x, field_name=score_name), - # n_resamples=self.n_resamples, - # confidence_level=self.confidence_level, - # random_state=self.new_random_generator(), - # ).confidence_interval - # full_score_name = score_name if len(func_name) == 0 else "_".join([str(func_name), score_name]) - # result[f"{full_score_name}_ci_low"] = ci.low - # result[f"{full_score_name}_ci_high"] = ci.high - # if score_name == self.main_score: - # result["score_ci_low"] = ci.low - # result["score_ci_high"] = ci.high - # return result - def score_based_confidence_interval( self, instances, @@ -589,6 +549,8 @@ def aggregate(instances, field_name): return np.nanmean(scores) def grouped_aggregate(self, instances, field_name, aggregation_func): + from collections import defaultdict + group_to_instance_scores = defaultdict(list) for instance in instances: additional_inputs = instance["additional_inputs"] @@ -724,10 +686,29 @@ class HuggingfaceMetric(GlobalMetric): scale: float = 1.0 # optional scaling of main results scaled_fields: list = None + # This are fixed arguments passed to compute method hf_compute_args: Dict[str, Any] = OptionalField(default_factory=dict) + # These are additional input fields passed to HF compute method (a list with one value per instance) hf_additional_input_fields: List = OptionalField(default_factory=list) + # These are additional input fields that are passed as one value + hf_additional_input_fields_pass_one_value: List = OptionalField( + default_factory=list + ) + experiment_id: str = OptionalField(default_factory=lambda: str(uuid.uuid4())) + def verify(self): + assert ( + self.hf_additional_input_fields is None + or isoftype(self.hf_additional_input_fields, List[str]) + ), f"Argument hf_additional_input_fields should be either None or List[str]. It is now: {self.hf_additional_input_fields}." + assert ( + self.hf_additional_input_fields_pass_one_value is None + or isoftype(self.hf_additional_input_fields_pass_one_value, List[str]) + ), f"Argument hf_additional_input_fields_pass_one_value should be either None or List[str]. It is now: {self.hf_additional_input_fields_pass_one_value}." 
+ + return super().verify() + def prepare(self): super().prepare() self.metric = evaluate.load( @@ -749,8 +730,22 @@ def compute( additional_input[additional_input_field] for additional_input in additional_inputs ] - # add check that all required fields in self.metrics are in passed_additional_inputs + for additional_input_field in self.hf_additional_input_fields_pass_one_value: + assert ( + additional_input_field in additional_inputs[0] + ), f"'{additional_input_field}' field required by {__class__.__name__} is not in passed in additional inputs: {additional_inputs[0]}" + + values = { + additional_input[additional_input_field] + for additional_input in additional_inputs + } + assert ( + len(values) == 1 + ), f"Values of '{additional_input_field}' field required by {__class__.__name__} should all be the same, but have multiple values {values}" + passed_additional_inputs[additional_input_field] = next(iter(values)) + + # add check that all required fields in self.metrics are in passed_additional_inputs print(passed_additional_inputs) result = self.metric.compute( predictions=predictions, references=references, @@ -899,10 +894,11 @@ class F1MultiLabel(GlobalMetric): main_score = "f1_macro" average = None # Report per class then aggregate by mean classes_to_ignore = ["none"] + metric = "f1" def prepare(self): super().prepare() - self._metric = evaluate.load("f1", "multilabel") + self._metric = evaluate.load(self.metric, "multilabel") def add_str_to_id(self, str): if str not in self.str_to_id: @@ -963,17 +959,17 @@ def compute( average=self.average, labels=labels_param, ) - if isinstance(result["f1"], numpy.ndarray): + if isinstance(result[self.metric], numpy.ndarray): from statistics import mean - assert len(result["f1"]) == len( - labels - ), f'F1 result ({result["f1"]}) has more entries than labels ({labels})' - final_result = {self.main_score: mean(result["f1"])} + assert ( + len(result[self.metric]) == len(labels) + ), f"F1 result ({result[self.metric]}) has more entries than labels ({labels})" + final_result = {self.main_score: mean(result[self.metric])} for i, label in enumerate(labels): - final_result["f1_" + label] = result["f1"][i] + final_result[self.metric + "_" + label] = result[self.metric][i] else: - final_result = {self.main_score: result["f1"]} + final_result = {self.main_score: result[self.metric]} return final_result def _validate_references_and_prediction(self, references, predictions): @@ -994,6 +990,30 @@ def _validate_references_and_prediction(self, references, predictions): ) +class PrecisionMacroMultiLabel(F1MultiLabel): + main_score = "precision_macro" + metric = "precision" + average = "macro" + + +class PrecisionMicroMultiLabel(F1MultiLabel): + main_score = "precision_micro" + metric = "precision" + average = "micro" + + +class RecallMacroMultiLabel(F1MultiLabel): + main_score = "recall_macro" + metric = "recall" + average = "macro" + + +class RecallMicroMultiLabel(F1MultiLabel): + main_score = "recall_micro" + metric = "recall" + average = "micro" + + class F1MicroMultiLabel(F1MultiLabel): main_score = "f1_micro" average = "micro" @@ -1116,27 +1136,36 @@ def compute( class CustomF1(GlobalMetric): main_score = "f1_micro" - classes = None + groups = None zero_division = 0.0 @abstractmethod - def get_element_group(self, element): + def get_element_group(self, element, additional_input): pass @abstractmethod - def get_element_representation(self, element): + def get_element_representation(self, element, additional_input): pass - def group_elements(self, elements_list): 
+ def should_ignore_element(self, element, additional_input): + return False + + def group_elements(self, elements_list, additional_input): + if not isinstance(elements_list, list): + elements_list = [elements_list] return { k: Counter( [ - self.get_element_representation(value) + self.get_element_representation(value, additional_input) for value in elements_list - if self.get_element_group(value) == k + if self.get_element_group(value, additional_input) == k ] ) - for k in {self.get_element_group(e) for e in elements_list} + for k in { + self.get_element_group(e, additional_input) + for e in elements_list + if not self.should_ignore_element(e, additional_input) + } } def calculate_groups_ratio(self, actual_group, total_group): @@ -1158,30 +1187,46 @@ def f1(self, pn, pd, rn, rd): except ZeroDivisionError: return self.zero_division + def get_groups(self, elements, additional_inputs): + groups = set() + for sublist, additional_input in zip(elements, additional_inputs): + for e in sublist: + if self.should_ignore_element(e, additional_input): + continue + groups.add(self.get_element_group(e, additional_input)) + return groups + def compute( self, - references: List[Any], + references: List[List[Any]], predictions: List[Any], additional_inputs: List[Dict], ) -> dict: # in case reference are List[List[List[Any]]] and predictions are List[List[Any]]: - if isinstance(references[0], list) and isinstance(references[0][0], list): + if ( + isinstance(references[0], list) + and len(references[0]) > 0 + and isinstance(references[0][0], list) + ): references = [element[0] for element in references] assert len(references) == len(predictions), ( f"references size ({len(references)})" f" doesn't mach predictions sise ({len(references)})." ) - if self.classes is None: - classes = { - self.get_element_group(e) for sublist in references for e in sublist - } + + if self.groups is None: + groups = self.get_groups(references, additional_inputs) else: - classes = self.classes + groups = self.groups groups_statistics = {} - for references_batch, predictions_batch in zip(references, predictions): - grouped_references = self.group_elements(references_batch) - grouped_predictions = self.group_elements(predictions_batch) + for references_batch, predictions_batch, additional_input in zip( + references, predictions, additional_inputs + ): + grouped_references = self.group_elements(references_batch, additional_input) + grouped_predictions = self.group_elements( + predictions_batch, additional_input + ) all_groups = set(grouped_references.keys()).union( grouped_predictions.keys() ) @@ -1224,7 +1269,7 @@ def compute( rn_total + rn, rd_total + rd, ) - if group in classes: + if group in groups: f1_result[f"f1_{group}"] = self.f1(pn, pd, rn, rd) recall_result[f"recall_{group}"] = self.recall(pn, pd, rn, rd) precision_result[f"precision_{group}"] = self.precision(pn, pd, rn, rd) @@ -1243,7 +1288,7 @@ def compute( except ZeroDivisionError: result["f1_macro"] = self.zero_division result["recall_macro"] = self.zero_division - result["micro_macro"] = self.zero_division + result["precision_macro"] = self.zero_division amount_of_predictions = pd_total if amount_of_predictions == 0: @@ -1261,10 +1306,10 @@ def compute( class NER(CustomF1): - def get_element_group(self, element): + def get_element_group(self, element, additional_input): return element[1] - def get_element_representation(self, element): + def get_element_representation(self, element, additional_input): return str(element) @@ -1626,6 +1671,17 @@ def _compute( return 
result +class KPA(CustomF1): + def get_element_group(self, element, additional_input): + return additional_input["keypoint"] + + def get_element_representation(self, element, additional_input): + return additional_input["keypoint"] + + def should_ignore_element(self, element, additional_input): + return element == "none" + + # define metrics that return means of an aggregation function applied across levels of a grouping variable def performance_drop_rate(instance_scores: List): """Percentage change of mean performance on test elements relative to that on a baseline. From 01914ff491ec4402bdaf583f4ad69f047907441c Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Mon, 8 Jan 2024 14:21:44 +0200 Subject: [PATCH 07/83] initial commit --- src/unitxt/catalog/metrics/group_mean_accuracy.json | 3 +++ src/unitxt/catalog/metrics/group_mean_f1_macro_multilabel.json | 3 +++ src/unitxt/catalog/metrics/group_mean_string_containment.json | 3 +++ src/unitxt/catalog/metrics/group_pdr_accuracy.json | 3 +++ src/unitxt/catalog/metrics/group_pdr_string_containment.json | 3 +++ 5 files changed, 15 insertions(+) create mode 100644 src/unitxt/catalog/metrics/group_mean_accuracy.json create mode 100644 src/unitxt/catalog/metrics/group_mean_f1_macro_multilabel.json create mode 100644 src/unitxt/catalog/metrics/group_mean_string_containment.json create mode 100644 src/unitxt/catalog/metrics/group_pdr_accuracy.json create mode 100644 src/unitxt/catalog/metrics/group_pdr_string_containment.json diff --git a/src/unitxt/catalog/metrics/group_mean_accuracy.json b/src/unitxt/catalog/metrics/group_mean_accuracy.json new file mode 100644 index 0000000000..6344b67ead --- /dev/null +++ b/src/unitxt/catalog/metrics/group_mean_accuracy.json @@ -0,0 +1,3 @@ +{ + "type": "mean_grouped_accuracy" +} diff --git a/src/unitxt/catalog/metrics/group_mean_f1_macro_multilabel.json b/src/unitxt/catalog/metrics/group_mean_f1_macro_multilabel.json new file mode 100644 index 0000000000..1110227ddc --- /dev/null +++ b/src/unitxt/catalog/metrics/group_mean_f1_macro_multilabel.json @@ -0,0 +1,3 @@ +{ + "type": "mean_grouped_token_overlap" +} diff --git a/src/unitxt/catalog/metrics/group_mean_string_containment.json b/src/unitxt/catalog/metrics/group_mean_string_containment.json new file mode 100644 index 0000000000..cbb1a7ff75 --- /dev/null +++ b/src/unitxt/catalog/metrics/group_mean_string_containment.json @@ -0,0 +1,3 @@ +{ + "type": "mean_grouped_string_containment" +} diff --git a/src/unitxt/catalog/metrics/group_pdr_accuracy.json b/src/unitxt/catalog/metrics/group_pdr_accuracy.json new file mode 100644 index 0000000000..441f2249fa --- /dev/null +++ b/src/unitxt/catalog/metrics/group_pdr_accuracy.json @@ -0,0 +1,3 @@ +{ + "type": "mean_grouped_accuracy_pdr" +} diff --git a/src/unitxt/catalog/metrics/group_pdr_string_containment.json b/src/unitxt/catalog/metrics/group_pdr_string_containment.json new file mode 100644 index 0000000000..f02205abf2 --- /dev/null +++ b/src/unitxt/catalog/metrics/group_pdr_string_containment.json @@ -0,0 +1,3 @@ +{ + "type": "mean_grouped_string_containment_pdr" +} From 7d98ec5ce8366065b02391fd6487e178025d649a Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Mon, 8 Jan 2024 15:16:54 +0200 Subject: [PATCH 08/83] rename grouped instance metrics so artifact type and name correspond --- src/unitxt/catalog/metrics/group_mean_accuracy.json | 2 +- src/unitxt/catalog/metrics/group_mean_string_containment.json | 2 +- src/unitxt/catalog/metrics/group_mean_token_overlap.json | 3 +++ 
src/unitxt/catalog/metrics/group_pdr_accuracy.json | 2 +- src/unitxt/catalog/metrics/group_pdr_string_containment.json | 2 +- 5 files changed, 7 insertions(+), 4 deletions(-) create mode 100644 src/unitxt/catalog/metrics/group_mean_token_overlap.json diff --git a/src/unitxt/catalog/metrics/group_mean_accuracy.json b/src/unitxt/catalog/metrics/group_mean_accuracy.json index 6344b67ead..6aa448f66e 100644 --- a/src/unitxt/catalog/metrics/group_mean_accuracy.json +++ b/src/unitxt/catalog/metrics/group_mean_accuracy.json @@ -1,3 +1,3 @@ { - "type": "mean_grouped_accuracy" + "type": "group_mean_accuracy" } diff --git a/src/unitxt/catalog/metrics/group_mean_string_containment.json b/src/unitxt/catalog/metrics/group_mean_string_containment.json index cbb1a7ff75..0d34e5d851 100644 --- a/src/unitxt/catalog/metrics/group_mean_string_containment.json +++ b/src/unitxt/catalog/metrics/group_mean_string_containment.json @@ -1,3 +1,3 @@ { - "type": "mean_grouped_string_containment" + "type": "group_mean_string_containment" } diff --git a/src/unitxt/catalog/metrics/group_mean_token_overlap.json b/src/unitxt/catalog/metrics/group_mean_token_overlap.json new file mode 100644 index 0000000000..4487385870 --- /dev/null +++ b/src/unitxt/catalog/metrics/group_mean_token_overlap.json @@ -0,0 +1,3 @@ +{ + "type": "group_mean_token_overlap" +} diff --git a/src/unitxt/catalog/metrics/group_pdr_accuracy.json b/src/unitxt/catalog/metrics/group_pdr_accuracy.json index 441f2249fa..f56a12e782 100644 --- a/src/unitxt/catalog/metrics/group_pdr_accuracy.json +++ b/src/unitxt/catalog/metrics/group_pdr_accuracy.json @@ -1,3 +1,3 @@ { - "type": "mean_grouped_accuracy_pdr" + "type": "group_pdr_accuracy" } diff --git a/src/unitxt/catalog/metrics/group_pdr_string_containment.json b/src/unitxt/catalog/metrics/group_pdr_string_containment.json index f02205abf2..c1bd327dd0 100644 --- a/src/unitxt/catalog/metrics/group_pdr_string_containment.json +++ b/src/unitxt/catalog/metrics/group_pdr_string_containment.json @@ -1,3 +1,3 @@ { - "type": "mean_grouped_string_containment_pdr" + "type": "group_pdr_string_containment" } From b99694a50f03815608a44da488c0d77a476d7dd5 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Mon, 8 Jan 2024 17:15:19 +0200 Subject: [PATCH 09/83] rename grouped instance metrics so artifact type and name correspond --- tests/test_metrics.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 230b8ec286..5098d4ff6d 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -9,11 +9,11 @@ F1Micro, F1MicroMultiLabel, F1Weighted, - MeanGroupedAccuracy, - MeanGroupedAccuracyPDR, - MeanGroupedStringContainment, - MeanGroupedStringContainmentPDR, - MeanGroupedTokenOverlap, + GroupMeanAccuracy, + GroupMeanStringContainment, + GroupMeanTokenOverlap, + GroupPDRAccuracy, + GroupPDRStringContainment, Rouge, Squad, TokenOverlap, @@ -480,10 +480,10 @@ def test_token_overlap(self): def test_grouped_instance_metrics(self): accuracy_metrics = [ - MeanGroupedAccuracy(), - MeanGroupedStringContainment(), - MeanGroupedAccuracyPDR(), - MeanGroupedStringContainmentPDR(), + GroupMeanAccuracy(), + GroupMeanStringContainment(), + GroupPDRAccuracy(), + GroupPDRStringContainment(), ] global_targets = [0.225, 0.4875, 0.8333333333333334, 0.4444444444444445] for metric, target in zip(accuracy_metrics, global_targets): @@ -499,7 +499,7 @@ def test_grouped_instance_metrics(self): msg=f"{outputs[0]['score']['global']['score_name']} does not 
equal the expected value", ) - f1_metrics = [MeanGroupedTokenOverlap()] + f1_metrics = [GroupMeanTokenOverlap()] global_targets = [0.5] for metric, target in zip(f1_metrics, global_targets): outputs = apply_metric( @@ -608,32 +608,32 @@ def _test_confidence_interval(self, metric, expected_ci_low, expected_ci_high): def test_grouped_instance_metric_confidence_interval(self): """Test the calculation of confidence intervals for grouped instance metrics (a subclass of global metrics).""" self._test_grouped_instance_confidence_interval( - metric=MeanGroupedAccuracy(), + metric=GroupMeanAccuracy(), expected_ci_low=0.025, expected_ci_high=0.44047619047619047, ) self._test_grouped_instance_confidence_interval( - metric=MeanGroupedStringContainment(), + metric=GroupMeanStringContainment(), expected_ci_low=0.15627449950197503, expected_ci_high=0.7080527276705951, ) self._test_grouped_instance_confidence_interval( - metric=MeanGroupedAccuracyPDR(), + metric=GroupPDRAccuracy(), expected_ci_low=0.375, expected_ci_high=1.0, ) self._test_grouped_instance_confidence_interval( - metric=MeanGroupedStringContainmentPDR(), + metric=GroupPDRStringContainment(), expected_ci_low=0.14285714285714288, expected_ci_high=1.0, ) # F1-based scores self._test_grouped_instance_confidence_interval( - metric=MeanGroupedTokenOverlap(), + metric=GroupMeanTokenOverlap(), references=GROUPED_INSTANCE_REFERENCES_SHORT, predictions=GROUPED_INSTANCE_PREDICTIONS_SHORT, expected_global_result={ From 735ce41ed7eba16af42cefb2743f25fada2ab54c Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Mon, 8 Jan 2024 17:15:26 +0200 Subject: [PATCH 10/83] rename grouped instance metrics so artifact type and name correspond --- prepare/metrics/grouped_instance_metrics.py | 40 ++++++--------------- src/unitxt/metrics.py | 10 +++--- 2 files changed, 16 insertions(+), 34 deletions(-) diff --git a/prepare/metrics/grouped_instance_metrics.py b/prepare/metrics/grouped_instance_metrics.py index 0e29ca8ca9..a08579dc97 100644 --- a/prepare/metrics/grouped_instance_metrics.py +++ b/prepare/metrics/grouped_instance_metrics.py @@ -1,10 +1,10 @@ from src.unitxt import add_to_catalog from src.unitxt.metrics import ( - MeanGroupedAccuracy, - MeanGroupedAccuracyPDR, - MeanGroupedStringContainment, - MeanGroupedStringContainmentPDR, - MeanGroupedTokenOverlap, + GroupMeanAccuracy, + GroupMeanStringContainment, + GroupMeanTokenOverlap, + GroupPDRAccuracy, + GroupPDRStringContainment, ) from src.unitxt.test_utils.metrics import test_metric @@ -128,7 +128,7 @@ for instance in instance_targets_accuracy: instance.update({"accuracy": instance["score"], "score_name": "accuracy"}) -metric = MeanGroupedAccuracy() +metric = GroupMeanAccuracy() global_target = { "group_mean_accuracy": 0.23, "score": 0.23, @@ -152,7 +152,7 @@ add_to_catalog(metric, "metrics.group_mean_accuracy", overwrite=True) -metric = MeanGroupedStringContainment() +metric = GroupMeanStringContainment() global_target = { "group_mean_string_containment": 0.49, "score": 0.49, @@ -177,7 +177,7 @@ # PDR -metric = MeanGroupedAccuracyPDR() +metric = GroupPDRAccuracy() global_target = { "group_pdr_accuracy": 0.83, "score": 0.83, @@ -201,7 +201,7 @@ add_to_catalog(metric, "metrics.group_pdr_accuracy", overwrite=True) -metric = MeanGroupedStringContainmentPDR() +metric = GroupPDRStringContainment() global_target = { "group_pdr_string_containment": 0.44, "score": 0.44, @@ -261,24 +261,6 @@ ["B", "A"], ["B"], ] -# f1_references = [[rr] for rr in f1_references] -instance_targets_f1 = [ - {"group_mean_f1_macro": 0.5, 
"score": 0.5, "score_name": "group_mean_f1_macro"}, - {"group_mean_f1_macro": 0.0, "score": 0.0, "score_name": "group_mean_f1_macro"}, - {"group_mean_f1_macro": 0.33, "score": 0.33, "score_name": "group_mean_f1_macro"}, - {"group_mean_f1_macro": 1.0, "score": 1.0, "score_name": "group_mean_f1_macro"}, - {"group_mean_f1_macro": 0.5, "score": 0.5, "score_name": "group_mean_f1_macro"}, - {"group_mean_f1_macro": 0.5, "score": 0.5, "score_name": "group_mean_f1_macro"}, - {"group_mean_f1_macro": 1.0, "score": 1.0, "score_name": "group_mean_f1_macro"}, - {"group_mean_f1_macro": 0.0, "score": 0.0, "score_name": "group_mean_f1_macro"}, - {"group_mean_f1_macro": 0.33, "score": 0.33, "score_name": "group_mean_f1_macro"}, - {"group_mean_f1_macro": 0.5, "score": 0.5, "score_name": "group_mean_f1_macro"}, - {"group_mean_f1_macro": 0.0, "score": 0.0, "score_name": "group_mean_f1_macro"}, - {"group_mean_f1_macro": 0.0, "score": 0.0, "score_name": "group_mean_f1_macro"}, - {"group_mean_f1_macro": 0.0, "score": 0.0, "score_name": "group_mean_f1_macro"}, - {"group_mean_f1_macro": 0.5, "score": 0.5, "score_name": "group_mean_f1_macro"}, - {"group_mean_f1_macro": 1.0, "score": 1.0, "score_name": "group_mean_f1_macro"}, -] global_target = { @@ -316,7 +298,7 @@ ] -metric = MeanGroupedTokenOverlap() +metric = GroupMeanTokenOverlap() outputs = test_metric( metric=metric, @@ -327,4 +309,4 @@ additional_inputs=additional_inputs, ) -add_to_catalog(metric, "metrics.group_mean_f1_macro_multilabel", overwrite=True) +add_to_catalog(metric, "metrics.group_mean_token_overlap", overwrite=True) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 1b31c5c9f4..dc69f62f83 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -1705,27 +1705,27 @@ def performance_drop_rate(instance_scores: List): ) -class MeanGroupedAccuracy(Accuracy): +class GroupMeanAccuracy(Accuracy): grouping_field = "group_id" reduction_map = {"group_mean": {"agg_func": ["mean", mean]}} -class MeanGroupedAccuracyPDR(Accuracy): +class GroupPDRAccuracy(Accuracy): grouping_field = "group_id" reduction_map = {"group_mean": {"agg_func": ["pdr", performance_drop_rate]}} -class MeanGroupedStringContainment(StringContainment): +class GroupMeanStringContainment(StringContainment): grouping_field = "group_id" reduction_map = {"group_mean": {"agg_func": ["mean", np.nanmean]}} -class MeanGroupedStringContainmentPDR(StringContainment): +class GroupPDRStringContainment(StringContainment): grouping_field = "group_id" reduction_map = {"group_mean": {"agg_func": ["pdr", performance_drop_rate]}} -class MeanGroupedTokenOverlap(TokenOverlap): +class GroupMeanTokenOverlap(TokenOverlap): grouping_field = "group_id" reduction_map = { "group_mean": { From 730ff45e08057ec096e4acd0f74e387731765709 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Mon, 8 Jan 2024 18:51:05 +0200 Subject: [PATCH 11/83] remove newline formatting --- src/unitxt/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/unitxt/version.py b/src/unitxt/version.py index 841aad2c60..60c886f96c 100644 --- a/src/unitxt/version.py +++ b/src/unitxt/version.py @@ -1 +1 @@ -version = "1.4.2" +version = "1.4.2" \ No newline at end of file From 0aaa1daa7dbb492a3aa4c7fcd69885cfc81f24f6 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Wed, 10 Jan 2024 17:59:28 +0200 Subject: [PATCH 12/83] remove (catalog from removed metric) --- src/unitxt/catalog/metrics/group_mean_f1_macro_multilabel.json | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 
src/unitxt/catalog/metrics/group_mean_f1_macro_multilabel.json diff --git a/src/unitxt/catalog/metrics/group_mean_f1_macro_multilabel.json b/src/unitxt/catalog/metrics/group_mean_f1_macro_multilabel.json deleted file mode 100644 index 1110227ddc..0000000000 --- a/src/unitxt/catalog/metrics/group_mean_f1_macro_multilabel.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "type": "mean_grouped_token_overlap" -} From 2d52c54b08bbec2cf65dcd93f7826523660c8f82 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Thu, 11 Jan 2024 21:06:55 +0200 Subject: [PATCH 13/83] fix some variation in expected values --- prepare/metrics/grouped_instance_metrics.py | 4 ++-- tests/test_metrics.py | 22 ++++++++++----------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/prepare/metrics/grouped_instance_metrics.py b/prepare/metrics/grouped_instance_metrics.py index a08579dc97..e83c44f149 100644 --- a/prepare/metrics/grouped_instance_metrics.py +++ b/prepare/metrics/grouped_instance_metrics.py @@ -130,8 +130,8 @@ metric = GroupMeanAccuracy() global_target = { - "group_mean_accuracy": 0.23, - "score": 0.23, + "group_mean_accuracy": 0.22, + "score": 0.22, "score_name": "group_mean_accuracy", "score_ci_low": 0.02, "score_ci_high": 0.44, diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 5098d4ff6d..bc13d081e9 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -610,13 +610,13 @@ def test_grouped_instance_metric_confidence_interval(self): self._test_grouped_instance_confidence_interval( metric=GroupMeanAccuracy(), expected_ci_low=0.025, - expected_ci_high=0.44047619047619047, + expected_ci_high=0.44105968464125495, ) self._test_grouped_instance_confidence_interval( metric=GroupMeanStringContainment(), - expected_ci_low=0.15627449950197503, - expected_ci_high=0.7080527276705951, + expected_ci_low=0.15556138609239942, + expected_ci_high=0.707936507936508, ) self._test_grouped_instance_confidence_interval( @@ -640,16 +640,16 @@ def test_grouped_instance_metric_confidence_interval(self): "group_mean_f1": 0.5, "score": 0.5, "score_name": "group_mean_f1", - "group_mean_f1_ci_low": 0.32199800893327996, - "group_mean_f1_ci_high": 0.7899498235031469, - "score_ci_low": 0.32199800893327996, - "score_ci_high": 0.7899498235031469, + "group_mean_f1_ci_low": 0.32222222222222224, + "group_mean_f1_ci_high": 0.7900160821100434, + "score_ci_low": 0.32222222222222224, + "score_ci_high": 0.7900160821100434, "group_mean_precision": 0.5, - "group_mean_precision_ci_low": 0.32199800893327996, - "group_mean_precision_ci_high": 0.7899498235031469, + "group_mean_precision_ci_low": 0.32222222222222224, + "group_mean_precision_ci_high": 0.7900160821100434, "group_mean_recall": 0.5, - "group_mean_recall_ci_low": 0.32199800893327996, - "group_mean_recall_ci_high": 0.7899498235031469, + "group_mean_recall_ci_low": 0.32222222222222224, + "group_mean_recall_ci_high": 0.7900160821100434, }, ) From 510d6e88fac9ea475dc0b4c120ce1cb6eefca3dd Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Thu, 11 Jan 2024 21:07:50 +0200 Subject: [PATCH 14/83] add catching of nanmean warning; fix InstanceMetric verification function --- src/unitxt/metrics.py | 53 +++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index dc69f62f83..4d200126c6 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -108,8 +108,6 @@ def score_based_confidence_interval( if not self._can_compute_confidence_intervals(num_predictions=len(instances)): 
return result - # resample the indices of the instances, which contain the scores - identifiers = list(range(len(instances))) if score_names is None: score_names = ( @@ -125,18 +123,12 @@ def aggregation_func(instances, field_name): def statistic(arr, axis, score_name=score_name): # arr is a 2d array where each row is a resampling, so we # iterate over the rows and compute the metric on each resampling - def metric(instances): - try: - return aggregation_func(instances, score_name) - except Exception as e: - # this happens in edge cases, for example, when the sampling creates a - # sample where all strings are empty and this fails bleu. - logger.info(f"Warning in {self.__class__.__name__}", e) - return np.nan + # def metric(instances): + # return aggregation_func(instances, score_name) scores = numpy.apply_along_axis( - lambda x: metric( - [instances[ii] for ii in x], + lambda resamled_instances: aggregation_func( + resamled_instances, score_name ), axis=axis, arr=arr, @@ -146,7 +138,7 @@ def metric(instances): # apply bootstrap only on the relevant field ci = bootstrap( - (identifiers,), + (instances,), statistic=statistic, n_resamples=self.n_resamples, confidence_level=self.confidence_level, @@ -455,13 +447,6 @@ def _validate_group_mean_reduction(self): if "score_fields" in fields: assert isinstance(fields["score_fields"], list) - return ( - [self.main_score] - if "score_fields" not in fields - else fields["score_fields"] - ) - return [self.main_score] - def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator: instances, global_score = self.compute_instance_scores(stream, stream_name) @@ -475,7 +460,12 @@ def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generato aggregation_func = self.aggregate if reduction == "group_mean": - score_fields = self._validate_group_mean_reduction() + self._validate_group_mean_reduction() + score_fields = ( + [self.main_score] + if "score_fields" not in fields + else fields["score_fields"] + ) def aggregation_func( instances, field_name, field=fields["agg_func"][1] @@ -546,7 +536,12 @@ def compute_instance_scores( @staticmethod def aggregate(instances, field_name): scores = [instance["score"]["instance"][field_name] for instance in instances] - return np.nanmean(scores) + import warnings + + with warnings.catch_warnings(): + # in case instances is empty, return NaN but avoid printing a RuntimeWarning + warnings.simplefilter("ignore", category=RuntimeWarning) + return np.nanmean(scores) def grouped_aggregate(self, instances, field_name, aggregation_func): from collections import defaultdict @@ -569,11 +564,15 @@ def grouped_aggregate(self, instances, field_name, aggregation_func): group_total_scores = [ aggregation_func(scores) for scores in group_to_instance_scores.values() ] - group_total_scores = [ - score for score in group_total_scores if not np.isnan(score) - ] - # ignore NaNs in aggregation - return mean(group_total_scores) if len(group_total_scores) else np.nan + import warnings + + with warnings.catch_warnings(): + # final mean should be mean of group_total_score, ignoring NaN, hence nanmean + # but if the group function values is NaN for ALL groups, nanmean throws a + # RuntimeWarning that it is calculating the mean of an empty slice (with no non-Nans) + # this is the desired behavior, but we want to avoid the warning here + warnings.simplefilter("ignore", category=RuntimeWarning) + return np.nanmean(group_total_scores) @abstractmethod def compute( From 8f5ce1031f113075bad6e1fcaeaef5c3fef2ec2a Mon Sep 
17 00:00:00 2001 From: Samuel Ackerman Date: Tue, 16 Jan 2024 14:47:14 +0200 Subject: [PATCH 15/83] InstanceMetric need to specify ci_scores for fields that have calculated CIs. score_based_confidence_interval accepts list of score fields without definining bootstrap function --- src/unitxt/metrics.py | 135 ++++++++++++++++++++++-------------------- 1 file changed, 72 insertions(+), 63 deletions(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 4286a36a1f..ebfe065199 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -85,9 +85,9 @@ def _can_compute_confidence_intervals(self, num_predictions): def score_based_confidence_interval( self, - instances, + instances: List[dict], + score_names: List[str], aggregation_func=None, - score_names: Optional[List[str]] = None, func_name="", ): """Compute confidence intervals based on existing scores, already computed on the input instances. @@ -95,26 +95,25 @@ def score_based_confidence_interval( Unlike GlobalMetric, this is simply a function of the instance scores (possibly taking into account additional_inputs field), so they don't need to be recomputed after every bootstrap draw. + instances: + The instances for which the confidence intervals are computed; should already have the relevant instance scores calculated. score_names: List[str] Compute a confidence interval for each score_name from this list. - instances: - The instances for which the confidence intervals are computed. aggregation_func: A function with arguments instances, field_name; is applied on list of instances (which may include additional_inputs field, as well as the prediction and references), and the field_name; default is simply to take the mean field_name from instances after resampling, if aggregation_func=None. + func_name: + An optional function name (if aggregation_func is not the mean) to append to each score_name in the results. + Used primarily for group_mean reductions. 
""" result = {} if not self._can_compute_confidence_intervals(num_predictions=len(instances)): return result - - if score_names is None: - score_names = ( - self.ci_scores if self.ci_scores is not None else [self.main_score] - ) + func_name = str(func_name) if aggregation_func is None: - + # by default mean aggregation def aggregation_func(instances, field_name): return mean([instance[field_name] for instance in instances]) @@ -123,12 +122,9 @@ def aggregation_func(instances, field_name): def statistic(arr, axis, score_name=score_name): # arr is a 2d array where each row is a resampling, so we # iterate over the rows and compute the metric on each resampling - # def metric(instances): - # return aggregation_func(instances, score_name) - scores = numpy.apply_along_axis( - lambda resamled_instances: aggregation_func( - resamled_instances, score_name + lambda resampled_instances: aggregation_func( + resampled_instances, score_name ), axis=axis, arr=arr, @@ -145,9 +141,7 @@ def statistic(arr, axis, score_name=score_name): random_state=self.new_random_generator(), ).confidence_interval full_score_name = ( - score_name - if len(func_name) == 0 - else "_".join([str(func_name), score_name]) + score_name if len(func_name) == 0 else func_name + "_" + score_name ) result[f"{full_score_name}_ci_low"] = ci.low result[f"{full_score_name}_ci_high"] = ci.high @@ -157,6 +151,7 @@ def statistic(arr, axis, score_name=score_name): return result def resample_from_non_nan(self, values): + """Given an array values, will replace any NaN values with elements resampled with replacement from the non-NaN ones.""" if values.size > 1: error_indices = numpy.isnan(values) n_errors = sum(error_indices) @@ -393,8 +388,13 @@ def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generato global_score["score"] = global_score[field_name] global_score["score_name"] = self.main_score + ci_fields = ( + list(set(self.ci_scores)) + if self.ci_scores is not None + else [self.main_score] + ) confidence_interval = self.score_based_confidence_interval( - instances=instances + instances=instances, score_names=ci_fields ) global_score.update(confidence_interval) @@ -428,77 +428,81 @@ def reduction_map(self) -> dict: pass def _validate_group_mean_reduction(self): - if "group_mean" in self.reduction_map: - # for group_mean, expects a dict - fields = self.reduction_map["group_mean"] - if not self.grouping_field: - raise ValueError( - "self.grouping_field is None, . " - "This field is required for group based metric computation." - ) + if not self.grouping_field: + raise ValueError( + "self.grouping_field is None, . " + "This field is required for group based metric computation." 
+ ) - assert isinstance(fields, dict) - assert ( - "agg_func" in fields - ), "fields should have a key 'agg_func' consisting of a 2-element list of a function name and function definition" - assert callable( - fields["agg_func"][1] - ), "second item in fields['agg_func'] should be a callable function" - if "score_fields" in fields: - assert isinstance(fields["score_fields"], list) + assert ( + "group_mean" in self.reduction_map + ), "reduction_map must have a `group_mean' key" + fields = self.reduction_map["group_mean"] + # for group_mean, expects a dict + assert isinstance(fields, dict) + assert ( + "agg_func" in fields + ), "fields should have a key 'agg_func' consisting of a 2-element list of a function name and function definition" + assert callable( + fields["agg_func"][1] + ), "second item in fields['agg_func'] should be a callable function" + if "score_fields" in fields: + assert isinstance(fields["score_fields"], list) def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator: instances, global_score = self.compute_instance_scores(stream, stream_name) - for reduction, fields in self.reduction_map.items(): + for reduction_type, reduction_params in self.reduction_map.items(): assert ( - reduction in self.implemented_reductions - ), f"Reduction {reduction} is not implemented, use one of {self.implemented_reductions}" - + reduction_type in self.implemented_reductions + ), f"Reduction {reduction_type} is not implemented, use one of {self.implemented_reductions}" aggregation_func = None - if reduction == "mean": - aggregation_func = self.aggregate - if reduction == "group_mean": + if reduction_type == "mean": + aggregation_func = self.aggregate_instance_scores + reduction_fields = list(set(reduction_params)) + field_name_full_prefix = "" + elif reduction_type == "group_mean": self._validate_group_mean_reduction() - score_fields = ( + reduction_fields = ( [self.main_score] - if "score_fields" not in fields - else fields["score_fields"] + if "score_fields" not in reduction_params + else list(set(reduction_params["score_fields"])) ) + aggregation_function_name = str(reduction_params["agg_func"][0]) + field_name_full_prefix = "group_" + aggregation_function_name def aggregation_func( - instances, field_name, field=fields["agg_func"][1] + instances, field_name, field=reduction_params["agg_func"][1] ): - return self.grouped_aggregate(instances, field_name, field) + return self.aggregate_instance_scores_by_group( + instances, field_name, field + ) if not aggregation_func: raise ValueError( - f"No aggregation_func was defined for reduction {reduction}. " + f"No aggregation_func was defined for reduction {reduction_type}. " f"Please specify a valid reduction method in reduction_map {self.reduction_map}." 
) - for field_name in score_fields if reduction == "group_mean" else fields: - if reduction == "group_mean": - field_name_full_prefix = "group_" + str(fields["agg_func"][0]) - field_name_full = "_".join([field_name_full_prefix, field_name]) + # calculate global scores for each reduction field + for field_name in reduction_fields: + if reduction_type == "group_mean": + field_name_full = field_name_full_prefix + "_" + field_name else: - field_name_full_prefix = "" field_name_full = field_name global_score[field_name_full] = aggregation_func(instances, field_name) if field_name == self.main_score: global_score["score"] = global_score[field_name_full] global_score["score_name"] = field_name_full - def bootstrap_aggregation_func( - instances, field_name, agg_func=aggregation_func - ): - return agg_func(instances=instances, field_name=field_name) - + # need to specify which fields should have CIs calculated for them through ci_scores + # (will not automatically calculate CIs for fields in reduction map) + if self.ci_scores is not None: confidence_interval = self.score_based_confidence_interval( instances=instances, - aggregation_func=bootstrap_aggregation_func, - score_names=[field_name], + aggregation_func=aggregation_func, + score_names=list(set(self.ci_scores)), func_name=field_name_full_prefix, ) global_score.update(confidence_interval) @@ -534,7 +538,7 @@ def compute_instance_scores( return instances, global_score @staticmethod - def aggregate(instances, field_name): + def aggregate_instance_scores(instances, field_name): scores = [instance["score"]["instance"][field_name] for instance in instances] import warnings @@ -543,7 +547,9 @@ def aggregate(instances, field_name): warnings.simplefilter("ignore", category=RuntimeWarning) return np.nanmean(scores) - def grouped_aggregate(self, instances, field_name, aggregation_func): + def aggregate_instance_scores_by_group( + self, instances, field_name, aggregation_func + ): from collections import defaultdict group_to_instance_scores = defaultdict(list) @@ -615,6 +621,7 @@ def compute( class Accuracy(InstanceMetric): reduction_map = {"mean": ["accuracy"]} main_score = "accuracy" + ci_scores = ["accuracy"] def compute( self, references: List[Any], prediction: Any, additional_inputs: List[Dict] @@ -632,6 +639,7 @@ def compute( class StringContainment(InstanceMetric): reduction_map = {"mean": ["string_containment"]} main_score = "string_containment" + ci_scores = ["string_containment"] def compute( self, references: List[Any], prediction: Any, additional_inputs: List[Dict] @@ -1062,6 +1070,7 @@ def compute(self, references, predictions, additional_inputs: List[Dict]): class CharEditDistanceAccuracy(InstanceMetric): reduction_map = {"mean": ["char_edit_dist_accuracy"]} main_score = "char_edit_dist_accuracy" + ci_scores = ["char_edit_dist_accuracy"] def prepare(self): super().prepare() From 55e559de2163539f51b1ff9cb81fd69702ebbfd0 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Tue, 16 Jan 2024 21:17:31 +0200 Subject: [PATCH 16/83] add ci_scores to several InstanceMetrics move aggregate_instance_scores as static method to MetricWithConfidenceInterval so can be used in score_based_confidence_interval --- src/unitxt/metrics.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 319f781ce0..802086e3e3 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -83,6 +83,17 @@ def _can_compute_confidence_intervals(self, num_predictions): and 
num_predictions > 1 ) + @staticmethod + def aggregate_instance_scores(instances, field_name): + """Calculate mean of a set of instance scores (given by field_name)""" + scores = [instance["score"]["instance"][field_name] for instance in instances] + import warnings + + with warnings.catch_warnings(): + # in case instances is empty, return NaN but avoid printing a RuntimeWarning + warnings.simplefilter("ignore", category=RuntimeWarning) + return np.nanmean(scores) + def score_based_confidence_interval( self, instances: List[dict], @@ -114,8 +125,7 @@ def score_based_confidence_interval( func_name = str(func_name) if aggregation_func is None: # by default mean aggregation - def aggregation_func(instances, field_name): - return mean([instance[field_name] for instance in instances]) + aggregation_func = self.aggregate_instance_scores for score_name in score_names: # need to redefine the statistic function within the loop because score_name is a loop variable, to avoid ruff errors @@ -537,15 +547,6 @@ def compute_instance_scores( return instances, global_score - @staticmethod - def aggregate_instance_scores(instances, field_name): - scores = [instance["score"]["instance"][field_name] for instance in instances] - import warnings - - with warnings.catch_warnings(): - # in case instances is empty, return NaN but avoid printing a RuntimeWarning - warnings.simplefilter("ignore", category=RuntimeWarning) - return np.nanmean(scores) def aggregate_instance_scores_by_group( self, instances, field_name, aggregation_func @@ -1793,6 +1794,7 @@ def _compute( class MRR(RetrievalMetric): reduction_map = {"mean": ["mrr"]} main_score = "mrr" + ci_scores = ["mrr"] def _compute( self, @@ -1809,6 +1811,7 @@ def _compute( class MAP(RetrievalMetric): reduction_map = {"mean": ["map"]} main_score = "map" + ci_scores = ["map"] def _compute( self, From 7047fd746b83d726433c3063df13cbb15e2acf51 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Tue, 16 Jan 2024 21:37:15 +0200 Subject: [PATCH 17/83] ruff formatting --- src/unitxt/metrics.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 802086e3e3..69478d71d2 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -85,7 +85,12 @@ def _can_compute_confidence_intervals(self, num_predictions): @staticmethod def aggregate_instance_scores(instances, field_name): - """Calculate mean of a set of instance scores (given by field_name)""" + """Calculate mean of a set of instance scores (given by field_name), ignoring NaNs. + + Args: + instances: The instances for which the confidence intervals are computed; should already have the relevant instance scores calculated. + field_name: score field names to compute mean for. + """ scores = [instance["score"]["instance"][field_name] for instance in instances] import warnings @@ -106,17 +111,17 @@ def score_based_confidence_interval( Unlike GlobalMetric, this is simply a function of the instance scores (possibly taking into account additional_inputs field), so they don't need to be recomputed after every bootstrap draw. - instances: - The instances for which the confidence intervals are computed; should already have the relevant instance scores calculated. - score_names: List[str] - Compute a confidence interval for each score_name from this list. 
- aggregation_func: - A function with arguments instances, field_name; is applied on list of instances (which may include additional_inputs - field, as well as the prediction and references), and the field_name; default is simply to take the mean field_name from - instances after resampling, if aggregation_func=None. - func_name: - An optional function name (if aggregation_func is not the mean) to append to each score_name in the results. - Used primarily for group_mean reductions. + Args: + instances: The instances for which the confidence intervals are computed; should already have the relevant instance scores calculated. + score_names: List of instance score field names to compute a confidence interval for. + aggregation_func: A function with arguments instances, field_name; is applied on list of instances (which may include additional_inputs + field, as well as the prediction and references), and the field_name; default is simply to take the mean field_name from + instances after resampling, if aggregation_func=None. + func_name: An optional function name (if aggregation_func is not the mean) to append to each score_name in the results. + Used primarily for group_mean reductions. + + Returns: + Dict of confidence interval values """ result = {} @@ -547,7 +552,6 @@ def compute_instance_scores( return instances, global_score - def aggregate_instance_scores_by_group( self, instances, field_name, aggregation_func ): From 67e05e4a774755439dc5f7e5ba98f75b0f713a6a Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Wed, 17 Jan 2024 11:25:37 +0200 Subject: [PATCH 18/83] add test_grouped_instance_metric_errors for code coverage --- tests/test_metrics.py | 79 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 488c44ced3..320b7aaf93 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -514,6 +514,85 @@ def test_grouped_instance_metrics(self): msg=f"{outputs[0]['score']['global']['score_name']} does not equal the expected value", ) + def test_grouped_instance_metric_errors(self): + """Test certain value and assertion error raises for grouped instance metrics (with group_mean reduction).""" + from statistics import mean + + class NoGroupField(Accuracy): + reduction_map = {"group_mean": {"agg_func": ["mean", mean]}} + + with self.assertRaises(ValueError): + # should raise error because no grouping_field + metric = NoGroupField() + apply_metric( + metric=metric, + predictions=GROUPED_INSTANCE_PREDICTIONS, + references=GROUPED_INSTANCE_REFERENCES, + additional_inputs=GROUPED_INSTANCE_ADDL_INPUTS, + ) + + from dataclasses import field + from typing import List + + class NoAggFuncReduction(Accuracy): + implemented_reductions: List[str] = field( + default_factory=lambda: ["mean", "group_mean", "some_other_func"] + ) + grouping_field = "group_id" + reduction_map = {"some_other_func": {"agg_func": ["mean", mean]}} + + with self.assertRaises(ValueError): + # should raise error because no aggregation_function will be defined, since only mean and group_mean are implemented + metric = NoAggFuncReduction() + apply_metric( + metric=metric, + predictions=GROUPED_INSTANCE_PREDICTIONS, + references=GROUPED_INSTANCE_REFERENCES, + additional_inputs=GROUPED_INSTANCE_ADDL_INPUTS, + ) + + class NoAggFunc(Accuracy): + grouping_field = "group_id" + reduction_map = {"group_mean": {"func": ["mean", mean]}} + + with self.assertRaises(AssertionError): + # should raise error because no "agg_func" field in group_mean + 
metric = NoAggFunc() + apply_metric( + metric=metric, + predictions=GROUPED_INSTANCE_PREDICTIONS, + references=GROUPED_INSTANCE_REFERENCES, + additional_inputs=GROUPED_INSTANCE_ADDL_INPUTS, + ) + + class NoCallableAggFunc(Accuracy): + grouping_field = "group_id" + reduction_map = {"group_mean": {"agg_func": ["mean", "some string"]}} + + with self.assertRaises(AssertionError): + # should raise error because second field of agg_func should be callable + metric = NoCallableAggFunc() + apply_metric( + metric=metric, + predictions=GROUPED_INSTANCE_PREDICTIONS, + references=GROUPED_INSTANCE_REFERENCES, + additional_inputs=GROUPED_INSTANCE_ADDL_INPUTS, + ) + + class WrongGroupID(Accuracy): + grouping_field = "random_id_name" + reduction_map = {"group_mean": {"agg_func": ["mean", mean]}} + + with self.assertRaises(ValueError): + # should raise error because grouping_field is not found in the additional inputs + metric = WrongGroupID() + apply_metric( + metric=metric, + predictions=GROUPED_INSTANCE_PREDICTIONS, + references=GROUPED_INSTANCE_REFERENCES, + additional_inputs=GROUPED_INSTANCE_ADDL_INPUTS, + ) + class TestConfidenceIntervals(unittest.TestCase): def test_confidence_interval_off(self): From 4cc38cb27fdcd8e5d5f5567ec1b3c033d0dcce3d Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Wed, 17 Jan 2024 22:17:51 +0200 Subject: [PATCH 19/83] add grouped instance metrics with normalized Cohen's h aggregation function --- prepare/metrics/grouped_instance_metrics.py | 50 +++++++++++++++++++++ src/unitxt/metrics.py | 45 +++++++++++++++++++ tests/test_metrics.py | 25 ++++++++++- 3 files changed, 119 insertions(+), 1 deletion(-) diff --git a/prepare/metrics/grouped_instance_metrics.py b/prepare/metrics/grouped_instance_metrics.py index e83c44f149..ea764c536f 100644 --- a/prepare/metrics/grouped_instance_metrics.py +++ b/prepare/metrics/grouped_instance_metrics.py @@ -3,6 +3,8 @@ GroupMeanAccuracy, GroupMeanStringContainment, GroupMeanTokenOverlap, + GroupNormCohensHAccuracy, + GroupNormCohensHStringContainment, GroupPDRAccuracy, GroupPDRStringContainment, ) @@ -224,6 +226,54 @@ add_to_catalog(metric, "metrics.group_pdr_string_containment", overwrite=True) +# Try Cohen's h instead of PDR since is symmetric and defined when baseline is 0 +metric = GroupNormCohensHAccuracy() +global_target = { + "group_norm_cohens_h_accuracy": -0.42, + "score": -0.42, + "score_name": "group_norm_cohens_h_accuracy", + "score_ci_low": -0.92, + "score_ci_high": -0.33, + "group_norm_cohens_h_accuracy_ci_low": -0.92, + "group_norm_cohens_h_accuracy_ci_high": -0.33, +} + + +outputs = test_metric( + metric=metric, + predictions=predictions, + references=references, + instance_targets=instance_targets_accuracy, + global_target=global_target, + additional_inputs=additional_inputs, +) + +add_to_catalog(metric, "metrics.group_norm_cohens_h_accuracy", overwrite=True) + + +metric = GroupNormCohensHStringContainment() +global_target = { + "group_norm_cohens_h_string_containment": -0.46, + "score": -0.46, + "score_name": "group_norm_cohens_h_string_containment", + "score_ci_low": -0.74, + "score_ci_high": -0.39, + "group_norm_cohens_h_string_containment_ci_low": -0.74, + "group_norm_cohens_h_string_containment_ci_high": -0.39, +} + + +outputs = test_metric( + metric=metric, + predictions=predictions, + references=references, + instance_targets=instance_targets_string_containment, + global_target=global_target, + additional_inputs=additional_inputs, +) + +add_to_catalog(metric, "metrics.group_norm_cohens_h_string_containment", 
overwrite=True) + # create references and predictions with only 3 unique values short_predictions = [ diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 69478d71d2..93f63850aa 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -1909,6 +1909,41 @@ def performance_drop_rate(instance_scores: List): ) +def normalized_cohens_h(instance_scores: List): + """Cohen's h between two proportions. + + Allows for change-type metric when the baseline is 0 (percentage change is undefined) + https://en.wikipedia.org/wiki/Cohen%27s_h + + Cohen's h effect size metric between two proportions p2 and p1 is 2 * (arcsin(sqrt(p2)) - arcsin(sqrt(p1))). + h in -pi, pi, with +/- representing the largest increase/decrease (p1=0, p2=1), or (p1=1, p2=0). + h=0 is no change. Unlike percentage change, h is defined if the baseline (p1) is 0. + Assumes the scores are in [0,1], either continuous or binary. + For scores in a list, the first element is treated as the baseline p1, and the mean of the others + as p2, and evaluates p2 change relative to p1. It is thus undefined if the list is of length < 2. + We rescale this to [-1,1] from [-pi,pi] for clarity, where +- 1 are the most extreme changes, and 0 is no change + + Args: + instance_scores: a list of scores on instances. Assume the first element is the original, the others are test set + + Returns: + float score between -1 and 1 + """ + assert isinstance(instance_scores, list) + assert all( + 0 <= score <= 1 for score in instance_scores + ), "all scores must be in [0,1]" + + if len(instance_scores) < 2: + # needs at least 2 elements + return np.nan + # assumes first element is the baseline proportion + baseline_p = instance_scores[0] + new_p = np.nanmean(instance_scores[1:]) + h = 2 * (np.arcsin(np.sqrt(new_p)) - np.arcsin(np.sqrt(baseline_p))) + return np.clip(a=h / np.pi, a_min=-1, a_max=1) + + class GroupMeanAccuracy(Accuracy): grouping_field = "group_id" reduction_map = {"group_mean": {"agg_func": ["mean", mean]}} @@ -1937,3 +1972,13 @@ class GroupMeanTokenOverlap(TokenOverlap): "score_fields": ["f1", "precision", "recall"], } } + + +class GroupNormCohensHAccuracy(Accuracy): + grouping_field = "group_id" + reduction_map = {"group_mean": {"agg_func": ["norm_cohens_h", normalized_cohens_h]}} + + +class GroupNormCohensHStringContainment(StringContainment): + grouping_field = "group_id" + reduction_map = {"group_mean": {"agg_func": ["norm_cohens_h", normalized_cohens_h]}} diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 320b7aaf93..f4069890a6 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -12,6 +12,8 @@ GroupMeanAccuracy, GroupMeanStringContainment, GroupMeanTokenOverlap, + GroupNormCohensHAccuracy, + GroupNormCohensHStringContainment, GroupPDRAccuracy, GroupPDRStringContainment, Rouge, @@ -484,8 +486,17 @@ def test_grouped_instance_metrics(self): GroupMeanStringContainment(), GroupPDRAccuracy(), GroupPDRStringContainment(), + GroupNormCohensHAccuracy(), + GroupNormCohensHStringContainment(), + ] + global_targets = [ + 0.225, + 0.4875, + 0.8333333333333334, + 0.4444444444444445, + -0.4249467048786864, + -0.4639421840102023, ] - global_targets = [0.225, 0.4875, 0.8333333333333334, 0.4444444444444445] for metric, target in zip(accuracy_metrics, global_targets): outputs = apply_metric( metric=metric, @@ -710,6 +721,18 @@ def test_grouped_instance_metric_confidence_interval(self): expected_ci_high=1.0, ) + self._test_grouped_instance_confidence_interval( + metric=GroupNormCohensHAccuracy(), + 
expected_ci_low=-0.9232678869571689, + expected_ci_high=-0.3333333333333333, + ) + + self._test_grouped_instance_confidence_interval( + metric=GroupNormCohensHStringContainment(), + expected_ci_low=-0.743586957620825, + expected_ci_high=-0.3916963890211997, + ) + # F1-based scores self._test_grouped_instance_confidence_interval( metric=GroupMeanTokenOverlap(), From 3631171a3b1d97538040ba0b06abbb8da3ed7f58 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Wed, 17 Jan 2024 22:21:55 +0200 Subject: [PATCH 20/83] add normalized Cohen's h --- src/unitxt/catalog/metrics/group_norm_cohens_h_accuracy.json | 3 +++ .../metrics/group_norm_cohens_h_string_containment.json | 3 +++ 2 files changed, 6 insertions(+) create mode 100644 src/unitxt/catalog/metrics/group_norm_cohens_h_accuracy.json create mode 100644 src/unitxt/catalog/metrics/group_norm_cohens_h_string_containment.json diff --git a/src/unitxt/catalog/metrics/group_norm_cohens_h_accuracy.json b/src/unitxt/catalog/metrics/group_norm_cohens_h_accuracy.json new file mode 100644 index 0000000000..860926ed3c --- /dev/null +++ b/src/unitxt/catalog/metrics/group_norm_cohens_h_accuracy.json @@ -0,0 +1,3 @@ +{ + "type": "group_norm_cohens_h_accuracy" +} diff --git a/src/unitxt/catalog/metrics/group_norm_cohens_h_string_containment.json b/src/unitxt/catalog/metrics/group_norm_cohens_h_string_containment.json new file mode 100644 index 0000000000..7fcc79b0e3 --- /dev/null +++ b/src/unitxt/catalog/metrics/group_norm_cohens_h_string_containment.json @@ -0,0 +1,3 @@ +{ + "type": "group_norm_cohens_h_string_containment" +} From f7382027521d58d30dc3be3ce1ac5dfa18063a8f Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Wed, 17 Jan 2024 22:22:10 +0200 Subject: [PATCH 21/83] marge with main --- src/unitxt/catalog/formats/user_agent.json | 2 +- .../instructions/models/japanese_llama.json | 4 -- .../catalog/instructions/models/llama.json | 2 +- .../metrics/token_overlap_with_context.json | 46 +------------------ src/unitxt/catalog/tasks/qa/open.json | 2 +- 5 files changed, 5 insertions(+), 51 deletions(-) diff --git a/src/unitxt/catalog/formats/user_agent.json b/src/unitxt/catalog/formats/user_agent.json index f1884edcb7..7eb488018f 100644 --- a/src/unitxt/catalog/formats/user_agent.json +++ b/src/unitxt/catalog/formats/user_agent.json @@ -1,5 +1,5 @@ { "type": "system_format", - "demo_format": "User:{source}\nAgent:{target}\n\n", + "demo_format": "User: {source}\nAgent: {target}\n\n", "model_input_format": "{instruction}\n{demos}\nUser:{source}\nAgent:" } diff --git a/src/unitxt/catalog/instructions/models/japanese_llama.json b/src/unitxt/catalog/instructions/models/japanese_llama.json index 063d98b10e..e69de29bb2 100644 --- a/src/unitxt/catalog/instructions/models/japanese_llama.json +++ b/src/unitxt/catalog/instructions/models/japanese_llama.json @@ -1,4 +0,0 @@ -{ - "type": "textual_instruction", - "text": "<>\nã‚ãªãŸã¯èª å®Ÿã§å„ªç§€ãªæ—¥æœ¬äººã®ã‚¢ã‚·ã‚¹ã‚¿ãƒ³ãƒˆã§ã™ã€‚\n<>\n\n" -} diff --git a/src/unitxt/catalog/instructions/models/llama.json b/src/unitxt/catalog/instructions/models/llama.json index 3be6ec6c1f..8ce97e5466 100644 --- a/src/unitxt/catalog/instructions/models/llama.json +++ b/src/unitxt/catalog/instructions/models/llama.json @@ -1,4 +1,4 @@ { "type": "textual_instruction", - "text": "<>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. 
Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don’t know the answer to aquestion, please don’t share false information.\n<>\n\n\n\n" + "text": "<>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don’t know the answer to aquestion, please don’t share false information.\n<>\n\n\n\n" } diff --git a/src/unitxt/catalog/metrics/token_overlap_with_context.json b/src/unitxt/catalog/metrics/token_overlap_with_context.json index 0670f7ba6e..e56ed950a5 100644 --- a/src/unitxt/catalog/metrics/token_overlap_with_context.json +++ b/src/unitxt/catalog/metrics/token_overlap_with_context.json @@ -1,46 +1,4 @@ { - "type": "metric_pipeline", - "main_score": "score", - "preprocess_steps": [ - { - "type": "copy_fields", - "field_to_field": [ - [ - "additional_inputs/context", - "references" - ] - ], - "use_query": true - }, - { - "type": "list_field_values", - "fields": [ - "references" - ], - "to_field": "references" - } - ], - "metric": { - "type": "token_overlap" - }, - "postpreprocess_steps": [ - { - "type": "copy_fields", - "field_to_field": [ - [ - "score/global/f1", - "score/global/f1_overlap_with_context" - ], - [ - "score/global/recall", - "score/global/recall_overlap_with_context" - ], - [ - "score/global/precision", - "score/global/precision_overlap_with_context" - ] - ], - "use_query": true - } - ] + "type": "reward", + "model_name": "OpenAssistant/reward-model-deberta-v3-large-v2" } diff --git a/src/unitxt/catalog/tasks/qa/open.json b/src/unitxt/catalog/tasks/qa/open.json index fee0976f79..5e897df9df 100644 --- a/src/unitxt/catalog/tasks/qa/open.json +++ b/src/unitxt/catalog/tasks/qa/open.json @@ -7,6 +7,6 @@ "answer" ], "metrics": [ - "metrics.squad" + "metrics.rouge" ] } From dd6bcfe0f7f45b2cb29b50b4d35f202c68c3f6a4 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Thu, 18 Jan 2024 13:20:31 +0200 Subject: [PATCH 22/83] change description of group_instance_metrics test since is no longer inheriting from GroupMetric --- tests/test_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_metrics.py b/tests/test_metrics.py index f4069890a6..a5bccf028e 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -696,7 +696,7 @@ def _test_confidence_interval(self, metric, expected_ci_low, expected_ci_high): ) def test_grouped_instance_metric_confidence_interval(self): - """Test the calculation of confidence intervals for grouped instance metrics (a subclass of global metrics).""" + """Test the calculation of confidence intervals for grouped instance metrics (sub-types of InstanceMetric with group_mean reduction).""" self._test_grouped_instance_confidence_interval( metric=GroupMeanAccuracy(), expected_ci_low=0.025, From 2ce1067ffa3130a951fb1c2c6f50ee6b053026fa Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Thu, 18 Jan 2024 15:37:59 +0200 Subject: [PATCH 23/83] checkout from main --- src/unitxt/catalog/formats/user_agent.json | 2 +- .../instructions/models/japanese_llama.json | 4 ++ 
.../catalog/instructions/models/llama.json | 2 +- .../metrics/token_overlap_with_context.json | 46 ++++++++++++++++++- src/unitxt/catalog/tasks/qa/open.json | 2 +- src/unitxt/version.py | 2 +- 6 files changed, 52 insertions(+), 6 deletions(-) diff --git a/src/unitxt/catalog/formats/user_agent.json b/src/unitxt/catalog/formats/user_agent.json index 7eb488018f..f1884edcb7 100644 --- a/src/unitxt/catalog/formats/user_agent.json +++ b/src/unitxt/catalog/formats/user_agent.json @@ -1,5 +1,5 @@ { "type": "system_format", - "demo_format": "User: {source}\nAgent: {target}\n\n", + "demo_format": "User:{source}\nAgent:{target}\n\n", "model_input_format": "{instruction}\n{demos}\nUser:{source}\nAgent:" } diff --git a/src/unitxt/catalog/instructions/models/japanese_llama.json b/src/unitxt/catalog/instructions/models/japanese_llama.json index e69de29bb2..063d98b10e 100644 --- a/src/unitxt/catalog/instructions/models/japanese_llama.json +++ b/src/unitxt/catalog/instructions/models/japanese_llama.json @@ -0,0 +1,4 @@ +{ + "type": "textual_instruction", + "text": "<>\nã‚ãªãŸã¯èª å®Ÿã§å„ªç§€ãªæ—¥æœ¬äººã®ã‚¢ã‚·ã‚¹ã‚¿ãƒ³ãƒˆã§ã™ã€‚\n<>\n\n" +} diff --git a/src/unitxt/catalog/instructions/models/llama.json b/src/unitxt/catalog/instructions/models/llama.json index 8ce97e5466..3be6ec6c1f 100644 --- a/src/unitxt/catalog/instructions/models/llama.json +++ b/src/unitxt/catalog/instructions/models/llama.json @@ -1,4 +1,4 @@ { "type": "textual_instruction", - "text": "<>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don’t know the answer to aquestion, please don’t share false information.\n<>\n\n\n\n" + "text": "<>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. 
If you don’t know the answer to aquestion, please don’t share false information.\n<>\n\n\n\n" } diff --git a/src/unitxt/catalog/metrics/token_overlap_with_context.json b/src/unitxt/catalog/metrics/token_overlap_with_context.json index e56ed950a5..0670f7ba6e 100644 --- a/src/unitxt/catalog/metrics/token_overlap_with_context.json +++ b/src/unitxt/catalog/metrics/token_overlap_with_context.json @@ -1,4 +1,46 @@ { - "type": "reward", - "model_name": "OpenAssistant/reward-model-deberta-v3-large-v2" + "type": "metric_pipeline", + "main_score": "score", + "preprocess_steps": [ + { + "type": "copy_fields", + "field_to_field": [ + [ + "additional_inputs/context", + "references" + ] + ], + "use_query": true + }, + { + "type": "list_field_values", + "fields": [ + "references" + ], + "to_field": "references" + } + ], + "metric": { + "type": "token_overlap" + }, + "postpreprocess_steps": [ + { + "type": "copy_fields", + "field_to_field": [ + [ + "score/global/f1", + "score/global/f1_overlap_with_context" + ], + [ + "score/global/recall", + "score/global/recall_overlap_with_context" + ], + [ + "score/global/precision", + "score/global/precision_overlap_with_context" + ] + ], + "use_query": true + } + ] } diff --git a/src/unitxt/catalog/tasks/qa/open.json b/src/unitxt/catalog/tasks/qa/open.json index 5e897df9df..fee0976f79 100644 --- a/src/unitxt/catalog/tasks/qa/open.json +++ b/src/unitxt/catalog/tasks/qa/open.json @@ -7,6 +7,6 @@ "answer" ], "metrics": [ - "metrics.rouge" + "metrics.squad" ] } diff --git a/src/unitxt/version.py b/src/unitxt/version.py index 88a127b422..de98a45b67 100644 --- a/src/unitxt/version.py +++ b/src/unitxt/version.py @@ -1 +1 @@ -version = "1.4.6" \ No newline at end of file +version = "1.4.6" From 2664de26ac2e28a6c706cc603f85c67c5d579786 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Sun, 21 Jan 2024 17:41:57 +0200 Subject: [PATCH 24/83] slight difference in results for confidence interval between Travis and local for Cohen's H --- tests/test_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_metrics.py b/tests/test_metrics.py index a5bccf028e..2a2b24c8bd 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -730,7 +730,7 @@ def test_grouped_instance_metric_confidence_interval(self): self._test_grouped_instance_confidence_interval( metric=GroupNormCohensHStringContainment(), expected_ci_low=-0.743586957620825, - expected_ci_high=-0.3916963890211997, + expected_ci_high=-0.3908330554711398, ) # F1-based scores From b6ed90e019eb30a4aec512986471c94aeae1915d Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Sun, 21 Jan 2024 18:34:45 +0200 Subject: [PATCH 25/83] add note for grouped instance CI for Cohen + StringContainment --- tests/test_metrics.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 2a2b24c8bd..6850fc2a29 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -727,6 +727,8 @@ def test_grouped_instance_metric_confidence_interval(self): expected_ci_high=-0.3333333333333333, ) + # note, this metric has an issue where the ci_high on PCs on Travis slightly diverges from the local results + # hence this test may fail on a PC self._test_grouped_instance_confidence_interval( metric=GroupNormCohensHStringContainment(), expected_ci_low=-0.743586957620825, From 16a17cceb071ef490ed4229bee42d6868a561e91 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Sun, 21 Jan 2024 18:43:06 +0200 Subject: [PATCH 26/83] add documentation to InstanceMetric group_mean 
reduction validation rename aggregate_instance_scores with average_instance_scores add _ directly to ci prefix --- src/unitxt/metrics.py | 81 +++++++++++++++++++++++++------------------ 1 file changed, 48 insertions(+), 33 deletions(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 93f63850aa..0892fd8c4a 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -84,27 +84,23 @@ def _can_compute_confidence_intervals(self, num_predictions): ) @staticmethod - def aggregate_instance_scores(instances, field_name): - """Calculate mean of a set of instance scores (given by field_name), ignoring NaNs. + def average_instance_scores(instances, field_name): + """Calculate mean of a set of instance scores (given by field_name). Args: instances: The instances for which the confidence intervals are computed; should already have the relevant instance scores calculated. field_name: score field names to compute mean for. """ - scores = [instance["score"]["instance"][field_name] for instance in instances] - import warnings - - with warnings.catch_warnings(): - # in case instances is empty, return NaN but avoid printing a RuntimeWarning - warnings.simplefilter("ignore", category=RuntimeWarning) - return np.nanmean(scores) + return mean( + [instance["score"]["instance"][field_name] for instance in instances] + ) def score_based_confidence_interval( self, instances: List[dict], score_names: List[str], aggregation_func=None, - func_name="", + ci_score_prefix="", ): """Compute confidence intervals based on existing scores, already computed on the input instances. @@ -117,8 +113,8 @@ def score_based_confidence_interval( aggregation_func: A function with arguments instances, field_name; is applied on list of instances (which may include additional_inputs field, as well as the prediction and references), and the field_name; default is simply to take the mean field_name from instances after resampling, if aggregation_func=None. - func_name: An optional function name (if aggregation_func is not the mean) to append to each score_name in the results. - Used primarily for group_mean reductions. + ci_score_prefix: An optional string prefix to the score_name in the CI. Useful in cases where the + aggregation_func is something other than the mean Returns: Dict of confidence interval values @@ -127,10 +123,10 @@ def score_based_confidence_interval( if not self._can_compute_confidence_intervals(num_predictions=len(instances)): return result - func_name = str(func_name) + ci_score_prefix = str(ci_score_prefix) if aggregation_func is None: # by default mean aggregation - aggregation_func = self.aggregate_instance_scores + aggregation_func = self.average_instance_scores for score_name in score_names: # need to redefine the statistic function within the loop because score_name is a loop variable, to avoid ruff errors @@ -155,9 +151,7 @@ def statistic(arr, axis, score_name=score_name): confidence_level=self.confidence_level, random_state=self.new_random_generator(), ).confidence_interval - full_score_name = ( - score_name if len(func_name) == 0 else func_name + "_" + score_name - ) + full_score_name = ci_score_prefix + score_name result[f"{full_score_name}_ci_low"] = ci.low result[f"{full_score_name}_ci_high"] = ci.high if score_name == self.main_score: @@ -433,8 +427,11 @@ class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval): default_factory=lambda: ["mean", "group_mean"] ) - # for grouped metrics: a field that contains the group id. None to disable grouping. 
- # Grouped metrics aggregate the instance score per group, and then average over group scores. + # InstanceMetric currently allows two reductions, 'mean', which calculates the mean of instance scores,' + # and 'group_mean', which first applies an aggregation function specified in the reduction_map + # to instance scores grouped by the field grouping_field (which must not be None), and returns the mean + # of the group scores; if grouping_field is None, grouping is disabled. + # see _validate_group_mean_reduction for an example and proper formatting of the reduction_map grouping_field: str = None @property @@ -443,6 +440,21 @@ def reduction_map(self) -> dict: pass def _validate_group_mean_reduction(self): + """Ensure that group_mean reduction_map is properly formatted. + + Example: Apply the variance (np.var) to group Accuracy instance scores. This class would be specified as follows: + + class GroupVarianceAccuracy(Accuracy): + grouping_field = 'group_id' + reduction_map = {'group_mean': {'agg_func': ['variance', np.var]}} + + reduction_map must be a dict with + - an 'agg_func' field with value being a 2-element list where + - 1st element is a string name of the aggregation function (used in naming the CI report) + - 2nd element is the callable aggregation function + - Optional: 'score_fields' key with list value containing the string names of fields to apply the aggregation to + - if not present, the parent class main_score is used. + """ if not self.grouping_field: raise ValueError( "self.grouping_field is None, . " @@ -458,6 +470,15 @@ def _validate_group_mean_reduction(self): assert ( "agg_func" in fields ), "fields should have a key 'agg_func' consisting of a 2-element list of a function name and function definition" + assert isinstance( + fields["agg_func"], list + ), "fields['agg_func'] should be a list" + assert ( + len(fields["agg_func"]) == 2 + ), "fields['agg_func'] should be a two-element list" + assert isinstance( + fields["agg_func"][0], str + ), "first item in fields['agg_func'] should be a string name of a function" assert callable( fields["agg_func"][1] ), "second item in fields['agg_func'] should be a callable function" @@ -465,18 +486,17 @@ def _validate_group_mean_reduction(self): assert isinstance(fields["score_fields"], list) def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator: - instances, global_score = self.compute_instance_scores(stream, stream_name) + instances, global_score = self.compute_instance_scores(stream) for reduction_type, reduction_params in self.reduction_map.items(): assert ( reduction_type in self.implemented_reductions ), f"Reduction {reduction_type} is not implemented, use one of {self.implemented_reductions}" - aggregation_func = None + field_name_full_prefix = "" if reduction_type == "mean": - aggregation_func = self.aggregate_instance_scores + aggregation_func = self.average_instance_scores reduction_fields = list(set(reduction_params)) - field_name_full_prefix = "" elif reduction_type == "group_mean": self._validate_group_mean_reduction() reduction_fields = ( @@ -485,7 +505,7 @@ def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generato else list(set(reduction_params["score_fields"])) ) aggregation_function_name = str(reduction_params["agg_func"][0]) - field_name_full_prefix = "group_" + aggregation_function_name + field_name_full_prefix = "group_" + aggregation_function_name + "_" def aggregation_func( instances, field_name, field=reduction_params["agg_func"][1] @@ -493,19 +513,14 @@ def 
aggregation_func( return self.aggregate_instance_scores_by_group( instances, field_name, field ) - - if not aggregation_func: + else: raise ValueError( - f"No aggregation_func was defined for reduction {reduction_type}. " - f"Please specify a valid reduction method in reduction_map {self.reduction_map}." + f"Reduction {reduction_type} is not supported, please specify a valid reduction method in reduction_map {self.reduction_map}." ) # calculate global scores for each reduction field for field_name in reduction_fields: - if reduction_type == "group_mean": - field_name_full = field_name_full_prefix + "_" + field_name - else: - field_name_full = field_name + field_name_full = field_name_full_prefix + field_name global_score[field_name_full] = aggregation_func(instances, field_name) if field_name == self.main_score: global_score["score"] = global_score[field_name_full] @@ -518,7 +533,7 @@ def aggregation_func( instances=instances, aggregation_func=aggregation_func, score_names=list(set(self.ci_scores)), - func_name=field_name_full_prefix, + ci_score_prefix=field_name_full_prefix, ) global_score.update(confidence_interval) From 83104db3d1e5fb0b2516895994faf448457dc2c1 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Tue, 23 Jan 2024 09:23:54 +0200 Subject: [PATCH 27/83] rename field as group_aggregation_func; use resample_from_non_nan in globalmetric confidence interval to ensure scores are not NaN --- src/unitxt/metrics.py | 48 +++++++++++++++---------------------------- 1 file changed, 17 insertions(+), 31 deletions(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 0892fd8c4a..6b5e085c22 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -112,7 +112,7 @@ def score_based_confidence_interval( score_names: List of instance score field names to compute a confidence interval for. aggregation_func: A function with arguments instances, field_name; is applied on list of instances (which may include additional_inputs field, as well as the prediction and references), and the field_name; default is simply to take the mean field_name from - instances after resampling, if aggregation_func=None. + instances after resampling, if argument is None. ci_score_prefix: An optional string prefix to the score_name in the CI. Useful in cases where the aggregation_func is something other than the mean @@ -125,7 +125,6 @@ def score_based_confidence_interval( return result ci_score_prefix = str(ci_score_prefix) if aggregation_func is None: - # by default mean aggregation aggregation_func = self.average_instance_scores for score_name in score_names: @@ -160,7 +159,11 @@ def statistic(arr, axis, score_name=score_name): return result def resample_from_non_nan(self, values): - """Given an array values, will replace any NaN values with elements resampled with replacement from the non-NaN ones.""" + """Given an array values, will replace any NaN values with elements resampled with replacement from the non-NaN ones. + + This makes it so that the bca confidence interval returned by bootstrap will not be NaN, since + bootstrap does not ignore NaNs. + """ if values.size > 1: error_indices = numpy.isnan(values) n_errors = sum(error_indices) @@ -203,30 +206,10 @@ def metric(sample_refs, sample_preds, sample_additional_inputs): arr=arr, ) - # when running with bca interval (default), the statistic is called twice: with the - # original data and with the resamples. here we want to focus only on the latter. 
- if scores.size > 1: - # here we deal with samples on which the metric could not be computed. These are - # edge cases - for example, when the sample contains only empty strings. - # CI is about the distribution around the statistic (e.g. mean), it doesn't deal with - # cases in which the metric is not computable. Therefore, we ignore these edge cases - # as part of the computation of CI. The question is how to implement this policy. - # Options: - # 1. skip the errors and return a shorter array => this fails because Scipy demans - # this callback (i.e. the statistic() callback) to return an array of the same size - # as the number of resamples - # 2. Put np.nan for the errors => this fails because in such case the ci itself - # becomes np.nan. So one edge case can fail the whole CI computation. - # 3. Replace the errors with a sampling from the successful cases => this is what - # is implemented. - error_indices = numpy.isnan(scores) - n_errors = sum(error_indices) - if n_errors > 0: - new_scores = random_gen.choice(scores, n_errors, replace=True) - scores = scores[~error_indices] - scores = np.concatenate([scores, new_scores]) - - return scores + # in some resamplings of instances, the global score may be NaN; in these cases + # the bca confidence interval will be NaN because it does not ignore these values, + # so we replace any NaN values with those resampled from the non-NaN ones. + return self.resample_from_non_nan(scores) result = {} num_predictions = len(predictions) @@ -508,10 +491,12 @@ def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generato field_name_full_prefix = "group_" + aggregation_function_name + "_" def aggregation_func( - instances, field_name, field=reduction_params["agg_func"][1] + instances, + field_name, + group_aggregation_func=reduction_params["agg_func"][1], ): return self.aggregate_instance_scores_by_group( - instances, field_name, field + instances, field_name, group_aggregation_func ) else: raise ValueError( @@ -568,7 +553,7 @@ def compute_instance_scores( return instances, global_score def aggregate_instance_scores_by_group( - self, instances, field_name, aggregation_func + self, instances, field_name, group_aggregation_func ): from collections import defaultdict @@ -588,7 +573,8 @@ def aggregate_instance_scores_by_group( ) group_total_scores = [ - aggregation_func(scores) for scores in group_to_instance_scores.values() + group_aggregation_func(scores) + for scores in group_to_instance_scores.values() ] import warnings From 4b5281d894f6565e2e27bcbf5178e6b0b7731e61 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Tue, 23 Jan 2024 09:26:00 +0200 Subject: [PATCH 28/83] rename field as group_aggregation_func; use resample_from_non_nan in globalmetric confidence interval to ensure scores are not NaN --- src/unitxt/metrics.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 6b5e085c22..d4ad57f02e 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -161,8 +161,9 @@ def statistic(arr, axis, score_name=score_name): def resample_from_non_nan(self, values): """Given an array values, will replace any NaN values with elements resampled with replacement from the non-NaN ones. - This makes it so that the bca confidence interval returned by bootstrap will not be NaN, since - bootstrap does not ignore NaNs. + This makes it so that, if possible, the bca confidence interval returned by bootstrap will not be NaN, since + bootstrap does not ignore NaNs. 
However, if there are 0 or 1 non-NaN values, or all non-NaN values are equal, + the resulting distribution will be degenerate (only one unique value) so the CI will still be NaN """ if values.size > 1: error_indices = numpy.isnan(values) From f7eca817febd1d7d8b0e40212113cd83b7d63281 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Tue, 23 Jan 2024 16:06:03 +0200 Subject: [PATCH 29/83] use same predictions and references for tokenoverlap as the other metrics --- prepare/metrics/grouped_instance_metrics.py | 77 +++++---------------- 1 file changed, 19 insertions(+), 58 deletions(-) diff --git a/prepare/metrics/grouped_instance_metrics.py b/prepare/metrics/grouped_instance_metrics.py index ea764c536f..ed140ab9a9 100644 --- a/prepare/metrics/grouped_instance_metrics.py +++ b/prepare/metrics/grouped_instance_metrics.py @@ -226,7 +226,7 @@ add_to_catalog(metric, "metrics.group_pdr_string_containment", overwrite=True) -# Try Cohen's h instead of PDR since is symmetric and defined when baseline is 0 + metric = GroupNormCohensHAccuracy() global_target = { "group_norm_cohens_h_accuracy": -0.42, @@ -274,77 +274,38 @@ add_to_catalog(metric, "metrics.group_norm_cohens_h_string_containment", overwrite=True) - -# create references and predictions with only 3 unique values -short_predictions = [ - "A", - "B", - "B", - "A", - "B", - "B", - "A", - "A", - "B", - "B", - "A", - "B", - "A", - "A", - "B", -] - -short_references = [ - ["A", "B"], - ["A", "C"], - ["B", "C", "A"], - ["A"], - ["B", "A"], - ["C", "B"], - ["A"], - ["B", "C"], - ["A", "B", "C"], - ["A", "B"], - ["B", "C"], - ["C"], - ["C", "B"], - ["B", "A"], - ["B"], -] - - global_target = { - "group_mean_f1": 0.5, - "score": 0.5, + "group_mean_f1": 0.51, + "score": 0.51, "score_name": "group_mean_f1", - "group_mean_f1_ci_low": 0.32, - "group_mean_f1_ci_high": 0.79, - "score_ci_low": 0.32, - "score_ci_high": 0.79, + "group_mean_f1_ci_low": 0.22, + "group_mean_f1_ci_high": 0.68, + "score_ci_low": 0.22, + "score_ci_high": 0.68, "group_mean_precision": 0.5, - "group_mean_precision_ci_low": 0.32, - "group_mean_precision_ci_high": 0.79, - "group_mean_recall": 0.5, - "group_mean_recall_ci_low": 0.32, - "group_mean_recall_ci_high": 0.79, + "group_mean_precision_ci_low": 0.21, + "group_mean_precision_ci_high": 0.67, + "group_mean_recall": 0.52, + "group_mean_recall_ci_low": 0.25, + "group_mean_recall_ci_high": 0.71, } instance_targets_token_overlap = [ - {"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, - {"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, {"precision": 1.0, "recall": 1.0, "f1": 1.0, "score": 1.0, "score_name": "f1"}, - {"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, {"precision": 1.0, "recall": 1.0, "f1": 1.0, "score": 1.0, "score_name": "f1"}, {"precision": 1.0, "recall": 1.0, "f1": 1.0, "score": 1.0, "score_name": "f1"}, {"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, {"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, {"precision": 1.0, "recall": 1.0, "f1": 1.0, "score": 1.0, "score_name": "f1"}, {"precision": 1.0, "recall": 1.0, "f1": 1.0, "score": 1.0, "score_name": "f1"}, + {"precision": 1.0, "recall": 1.0, "f1": 1.0, "score": 1.0, "score_name": "f1"}, + {"precision": 1.0, "recall": 1.0, "f1": 1.0, "score": 1.0, "score_name": "f1"}, + {"precision": 0.5, "recall": 1.0, "f1": 0.67, "score": 0.67, "score_name": "f1"}, + {"precision": 1.0, "recall": 1.0, "f1": 1.0, "score": 1.0, "score_name": "f1"}, + {"precision": 1.0, 
"recall": 1.0, "f1": 1.0, "score": 1.0, "score_name": "f1"}, {"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, {"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, {"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, - {"precision": 0, "recall": 0, "f1": 0, "score": 0, "score_name": "f1"}, - {"precision": 1.0, "recall": 1.0, "f1": 1.0, "score": 1.0, "score_name": "f1"}, ] @@ -352,8 +313,8 @@ outputs = test_metric( metric=metric, - predictions=short_predictions, - references=short_references, + predictions=[str(vv) for vv in predictions], + references=[[str(vvv) for vvv in vv] for vv in references], instance_targets=instance_targets_token_overlap, global_target=global_target, additional_inputs=additional_inputs, From e25dedd9eb9243e567dcca90f9a82277db8dac2a Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Tue, 23 Jan 2024 16:06:53 +0200 Subject: [PATCH 30/83] add additional comments to resample_from_non_nan from original version --- src/unitxt/metrics.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index d4ad57f02e..a7e5dd0eb7 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -161,9 +161,24 @@ def statistic(arr, axis, score_name=score_name): def resample_from_non_nan(self, values): """Given an array values, will replace any NaN values with elements resampled with replacement from the non-NaN ones. - This makes it so that, if possible, the bca confidence interval returned by bootstrap will not be NaN, since + here we deal with samples on which the metric could not be computed. These are + edge cases - for example, when the sample contains only empty strings. + CI is about the distribution around the statistic (e.g. mean), it doesn't deal with + cases in which the metric is not computable. Therefore, we ignore these edge cases + as part of the computation of CI. + + In theory there would be several ways to deal with this: + 1. skip the errors and return a shorter array => this fails because Scipy requires + this callback (i.e. the statistic() callback) to return an array of the same size + as the number of resamples + 2. Put np.nan for the errors => this fails because in such case the ci itself + becomes np.nan. So one edge case can fail the whole CI computation. + 3. Replace the errors with a sampling from the successful cases => this is what is implemented. + + This resampling makes it so that, if possible, the bca confidence interval returned by bootstrap will not be NaN, since bootstrap does not ignore NaNs. However, if there are 0 or 1 non-NaN values, or all non-NaN values are equal, - the resulting distribution will be degenerate (only one unique value) so the CI will still be NaN + the resulting distribution will be degenerate (only one unique value) so the CI will still be NaN since there is + no variability. In this case, the CI is essentially an interval of length 0 equaling the mean itself. 
""" if values.size > 1: error_indices = numpy.isnan(values) @@ -197,6 +212,7 @@ def metric(sample_refs, sample_preds, sample_additional_inputs): logger.info(f"Warning in {self.__class__.__name__}", e) return np.nan + # resample the instance scores, and then return the global score each time scores = numpy.apply_along_axis( lambda x: metric( sample_refs=[references[i] for i in x], @@ -207,8 +223,8 @@ def metric(sample_refs, sample_preds, sample_additional_inputs): arr=arr, ) - # in some resamplings of instances, the global score may be NaN; in these cases - # the bca confidence interval will be NaN because it does not ignore these values, + # in some resamplings of instances, the global score may be NaN since it cannot be computed; + # in these cases, the bca confidence interval will be NaN because it does not ignore these values, # so we replace any NaN values with those resampled from the non-NaN ones. return self.resample_from_non_nan(scores) @@ -1914,7 +1930,7 @@ def performance_drop_rate(instance_scores: List): def normalized_cohens_h(instance_scores: List): """Cohen's h between two proportions. - Allows for change-type metric when the baseline is 0 (percentage change is undefined) + Allows for change-type metric when the baseline is 0 (percentage change, and thus PDR, is undefined) https://en.wikipedia.org/wiki/Cohen%27s_h Cohen's h effect size metric between two proportions p2 and p1 is 2 * (arcsin(sqrt(p2)) - arcsin(sqrt(p1))). From 438aee263f105171fb4fe282b1756efefa11be00 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Tue, 23 Jan 2024 16:09:46 +0200 Subject: [PATCH 31/83] use same references and predictions for tokenoverlap as for other grouped instance metrics; remove np.nan conditions on CIs since doesn't happen in our examples. --- tests/test_metrics.py | 109 +++++++----------------------------------- 1 file changed, 16 insertions(+), 93 deletions(-) diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 6850fc2a29..325449957e 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -31,14 +31,14 @@ "C", "123", "BCD", - 10, + "10", " BD", "AB", "I am a dog", "AB C", "AB 1", "GMA", - 0.123, + "0.123", "BD", "abc", ] @@ -47,56 +47,20 @@ ["B", "AB", "A"], ["A", "BC D", "BC DF"], ["c", " C"], - [13, 23, 234], + ["13", "23", "234"], [" ", " BD", " BDA"], - [1, 10, 100], + ["1", "10", "100"], ["A", "B", "BD"], ["ABC", "ab", "BC"], ["I am a person", "I AM A DOG", "ABC"], ["AB CD", "AB", "ab"], ["AB 1", "AB1"], [" GMA 123", "GMA"], - ["123", 0.12], + ["123", "0.12"], ["BDE", "BCE", "bdefs"], [" abcdefg", "AB", "abcd"], ] -GROUPED_INSTANCE_PREDICTIONS_SHORT = [ - "A", - "B", - "B", - "A", - "B", - "B", - "A", - "A", - "B", - "B", - "A", - "B", - "A", - "A", - "B", -] - -GROUPED_INSTANCE_REFERENCES_SHORT = [ - ["A", "B"], - ["A", "C"], - ["B", "C", "A"], - ["A"], - ["B", "A"], - ["C", "B"], - ["A"], - ["B", "C"], - ["A", "B", "C"], - ["A", "B"], - ["B", "C"], - ["C"], - ["C", "B"], - ["B", "A"], - ["B"], -] - # possibly multi-column group identifier GROUPED_INSTANCE_ADDL_INPUTS = ( [{"group": "grp1", "id": 0}] * 5 @@ -488,6 +452,7 @@ def test_grouped_instance_metrics(self): GroupPDRStringContainment(), GroupNormCohensHAccuracy(), GroupNormCohensHStringContainment(), + GroupMeanTokenOverlap(), ] global_targets = [ 0.225, @@ -496,6 +461,7 @@ def test_grouped_instance_metrics(self): 0.4444444444444445, -0.4249467048786864, -0.4639421840102023, + 0.5083333333333333, ] for metric, target in zip(accuracy_metrics, global_targets): outputs = apply_metric( @@ -507,22 
+473,7 @@ def test_grouped_instance_metrics(self): self.assertAlmostEqual( target, outputs[0]["score"]["global"]["score"], - msg=f"{outputs[0]['score']['global']['score_name']} does not equal the expected value", - ) - - f1_metrics = [GroupMeanTokenOverlap()] - global_targets = [0.5] - for metric, target in zip(f1_metrics, global_targets): - outputs = apply_metric( - metric=metric, - predictions=GROUPED_INSTANCE_PREDICTIONS_SHORT, - references=GROUPED_INSTANCE_REFERENCES_SHORT, - additional_inputs=GROUPED_INSTANCE_ADDL_INPUTS, - ) - self.assertAlmostEqual( - target, - outputs[0]["score"]["global"]["score"], - msg=f"{outputs[0]['score']['global']['score_name']} does not equal the expected value", + msg=f"{outputs[0]['score']['global']['score_name']} does not equal the expected value {target}", ) def test_grouped_instance_metric_errors(self): @@ -735,26 +686,10 @@ def test_grouped_instance_metric_confidence_interval(self): expected_ci_high=-0.3908330554711398, ) - # F1-based scores self._test_grouped_instance_confidence_interval( metric=GroupMeanTokenOverlap(), - references=GROUPED_INSTANCE_REFERENCES_SHORT, - predictions=GROUPED_INSTANCE_PREDICTIONS_SHORT, - expected_global_result={ - "group_mean_f1": 0.5, - "score": 0.5, - "score_name": "group_mean_f1", - "group_mean_f1_ci_low": 0.32222222222222224, - "group_mean_f1_ci_high": 0.7900160821100434, - "score_ci_low": 0.32222222222222224, - "score_ci_high": 0.7900160821100434, - "group_mean_precision": 0.5, - "group_mean_precision_ci_low": 0.32222222222222224, - "group_mean_precision_ci_high": 0.7900160821100434, - "group_mean_recall": 0.5, - "group_mean_recall_ci_low": 0.32222222222222224, - "group_mean_recall_ci_high": 0.7900160821100434, - }, + expected_ci_low=0.22302503471948287, + expected_ci_high=0.6805555555555555, ) def _test_grouped_instance_confidence_interval( @@ -767,8 +702,6 @@ def _test_grouped_instance_confidence_interval( expected_global_result=None, ): """Test the calculation of confidence intervals for a given metric with group_mean reduction.""" - import numpy as np - outputs = apply_metric( metric=metric, predictions=predictions, @@ -796,22 +729,12 @@ def _test_grouped_instance_confidence_interval( logger.info(global_result) for score_name, score_value in global_result.items(): if score_name in expected_global_result: - # Test that the output value is the same as the expected value - # allow for cases where value is NaN - if not isinstance(score_value, str): - if np.isnan(expected_global_result[score_name]): - assert np.isnan(score_value) - elif np.isnan(score_value): - assert np.isnan(expected_global_result[score_name]) - else: - self.assertAlmostEqual( - score_value, - expected_global_result[score_name], - places=5, - msg=f"score mismatch for {group_score_name}", - ) - else: - self.assertEqual(score_value, expected_global_result[score_name]) + self.assertAlmostEqual( + score_value, + expected_global_result[score_name], + places=5, + msg=f"score mismatch for {group_score_name}, got {expected_global_result[score_name]} but expected {score_value}", + ) else: # An output score that is not expected # This is ok if the score_name is not related to confidence intervals From 9cb48dc3dd381820b725c1ca0ce24d7e5838b66b Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Tue, 23 Jan 2024 17:25:37 +0200 Subject: [PATCH 32/83] return global result to CI test for grouped instance because of tokenoverlap --- tests/test_metrics.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tests/test_metrics.py 
b/tests/test_metrics.py index 325449957e..3bf72896bd 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -686,10 +686,24 @@ def test_grouped_instance_metric_confidence_interval(self): expected_ci_high=-0.3908330554711398, ) + # pass global dict because there are additional fields other than the main score self._test_grouped_instance_confidence_interval( metric=GroupMeanTokenOverlap(), - expected_ci_low=0.22302503471948287, - expected_ci_high=0.6805555555555555, + expected_global_result={ + "group_mean_recall": 0.525, + "group_mean_f1": 0.5083333333333333, + "score": 0.5083333333333333, + "score_name": "group_mean_f1", + "group_mean_precision": 0.5, + "group_mean_recall_ci_low": 0.25, + "group_mean_recall_ci_high": 0.7083333333333334, + "group_mean_f1_ci_low": 0.22302503471948287, + "group_mean_f1_ci_high": 0.6805555555555555, + "score_ci_low": 0.22302503471948287, + "score_ci_high": 0.6805555555555555, + "group_mean_precision_ci_low": 0.20949399775845196, + "group_mean_precision_ci_high": 0.6666666666666666, + }, ) def _test_grouped_instance_confidence_interval( From 288e29c905ebbebaa710887a7a5f8f5136fd1ff8 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Wed, 24 Jan 2024 17:18:02 +0200 Subject: [PATCH 33/83] add interpretation option and comment to cohen's h --- src/unitxt/metrics.py | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 73025f70fd..e4cc76be16 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -1942,23 +1942,30 @@ def performance_drop_rate(instance_scores: List): ) -def normalized_cohens_h(instance_scores: List): - """Cohen's h between two proportions. +def normalized_cohens_h(instance_scores: List, interpret=False): + """Cohen's h between two proportions, normalized to interval [-1,1]. Allows for change-type metric when the baseline is 0 (percentage change, and thus PDR, is undefined) https://en.wikipedia.org/wiki/Cohen%27s_h Cohen's h effect size metric between two proportions p2 and p1 is 2 * (arcsin(sqrt(p2)) - arcsin(sqrt(p1))). - h in -pi, pi, with +/- representing the largest increase/decrease (p1=0, p2=1), or (p1=1, p2=0). - h=0 is no change. Unlike percentage change, h is defined if the baseline (p1) is 0. + h in -pi, pi, with +/-pi representing the largest increase/decrease (p1=0, p2=1), or (p1=1, p2=0). + h=0 is no change. Unlike percentage change, h is defined even if the baseline (p1) is 0. Assumes the scores are in [0,1], either continuous or binary. For scores in a list, the first element is treated as the baseline p1, and the mean of the others as p2, and evaluates p2 change relative to p1. It is thus undefined if the list is of length < 2. We rescale this to [-1,1] from [-pi,pi] for clarity, where +- 1 are the most extreme changes, and 0 is no change + Interpretation: the original unscaled Cohen's h can be interepreted as + - an insignificant difference if 0 < |h| < 0.2 + - small difference if 0.2 <= |h| < 0.5 + - a medium difference if 0.5 <= |h| < 0.8 + - a large difference if 0.8 <= |h| + Thus, the rule of interpreting the effect of the normalized value is to use the same thresholds divided by pi + Args: instance_scores: a list of scores on instances. 
Assume the first element is the original, the others are test set - + interpret: boolean, whether to interpret the significance of the score or not Returns: float score between -1 and 1 """ @@ -1974,7 +1981,19 @@ def normalized_cohens_h(instance_scores: List): baseline_p = instance_scores[0] new_p = np.nanmean(instance_scores[1:]) h = 2 * (np.arcsin(np.sqrt(new_p)) - np.arcsin(np.sqrt(baseline_p))) - return np.clip(a=h / np.pi, a_min=-1, a_max=1) + norm_h = np.clip(a=h / np.pi, a_min=-1, a_max=1) + if not interpret: + return norm_h + + import pandas as pd + + how_signif = pd.cut( + x=[np.abs(h)], + right=False, + bins=[-1, 0.2, 0.5, 0.8, np.Inf], + labels=["not significant", "small", "medium", "large"], + ) + return norm_h, how_signif[0] class GroupMeanAccuracy(Accuracy): From 1535e85b8c38930f0a03cf80a3e9054a4119cf28 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Sun, 28 Jan 2024 15:34:03 +0200 Subject: [PATCH 34/83] add group_mean_subgroup_comparison reduction to InstanceMetric; update CIs for Cohen's h and PDR classes that were incorrectly specified before --- prepare/metrics/grouped_instance_metrics.py | 110 ++++----- src/unitxt/metrics.py | 255 ++++++++++++++++---- tests/test_metrics.py | 50 ++-- 3 files changed, 300 insertions(+), 115 deletions(-) diff --git a/prepare/metrics/grouped_instance_metrics.py b/prepare/metrics/grouped_instance_metrics.py index ed140ab9a9..9b87dd6a44 100644 --- a/prepare/metrics/grouped_instance_metrics.py +++ b/prepare/metrics/grouped_instance_metrics.py @@ -1,3 +1,5 @@ +from copy import deepcopy + from src.unitxt import add_to_catalog from src.unitxt.metrics import ( GroupMeanAccuracy, @@ -46,62 +48,51 @@ [" abcdefg", "AB", "abcd"], ] -# possibly multi-column group identifier +# possibly multi-column group identifier; 'ignore' is unused +# use deepcopy so that dicts in list are independent and can be updated separately additional_inputs = ( - [{"group": "grp1", "id": 0, "ignore": 1}] * 5 - + [{"group": "grp1", "id": 1, "ignore": 1}] * 5 - + [{"group": "grp2", "id": 0, "ignore": 1}] * 4 - + [{"group": "grp2", "id": 1, "ignore": 0}] * 1 + [deepcopy({"group": "grp1", "id": 0, "ignore": 1}) for _ in range(5)] + + [deepcopy({"group": "grp1", "id": 1, "ignore": 1}) for _ in range(5)] + + [deepcopy({"group": "grp2", "id": 0, "ignore": 1}) for _ in range(4)] + + [deepcopy({"group": "grp2", "id": 1, "ignore": 0}) for _ in range(1)] +) +# for group_mean_subgroup_comparison metrics, add a subgroup indicator (by default called 'variant_type') +# these groupings correspond in length to the group identifiers above +variant_type = ( + (["original"] + ["paraphrase"] * 4) + + (["original"] + ["paraphrase"] * 4) + + (["original"] + ["paraphrase"] * 3) + + ["original"] ) -group_by_fields = ["group", "id"] # construct grouping_field by combining two other fields (and ignoring one); mimics what you would do in cards -for ai in additional_inputs: - ai.update({"group_id": "_".join([str(ai[ff]) for ff in group_by_fields])}) +group_by_fields = ["group", "id"] + +for ai, vt in zip(additional_inputs, variant_type): + ai.update( + { + "group_id": "_".join([str(ai[ff]) for ff in group_by_fields]), + "variant_type": vt, + } + ) instance_targets_string_containment = [ {"score": 1.0}, {"score": 1.0}, - { - "score": 0.0, - }, - { - "score": 1.0, - }, - { - "score": 0.0, - }, - { - "score": 1.0, - }, - { - "score": 1.0, - }, - { - "score": 0.0, - }, - { - "score": 0.0, - }, - { - "score": 1.0, - }, - { - "score": 1.0, - }, - { - "score": 1.0, - }, - { - "score": 1.0, - }, - { - 
"score": 0.0, - }, - { - "score": 0.0, - }, + {"score": 0.0}, + {"score": 1.0}, + {"score": 0.0}, + {"score": 1.0}, + {"score": 1.0}, + {"score": 0.0}, + {"score": 0.0}, + {"score": 1.0}, + {"score": 1.0}, + {"score": 1.0}, + {"score": 1.0}, + {"score": 0.0}, + {"score": 0.0}, ] for instance in instance_targets_string_containment: @@ -126,7 +117,6 @@ {"score": 0.0}, {"score": 0.0}, ] - for instance in instance_targets_accuracy: instance.update({"accuracy": instance["score"], "score_name": "accuracy"}) @@ -184,9 +174,9 @@ "group_pdr_accuracy": 0.83, "score": 0.83, "score_name": "group_pdr_accuracy", - "score_ci_low": 0.38, + "score_ci_low": 0.0, "score_ci_high": 1.0, - "group_pdr_accuracy_ci_low": 0.38, + "group_pdr_accuracy_ci_low": 0.0, "group_pdr_accuracy_ci_high": 1.0, } @@ -208,9 +198,9 @@ "group_pdr_string_containment": 0.44, "score": 0.44, "score_name": "group_pdr_string_containment", - "score_ci_low": 0.14, + "score_ci_low": 0.0, "score_ci_high": 1.0, - "group_pdr_string_containment_ci_low": 0.14, + "group_pdr_string_containment_ci_low": 0.0, "group_pdr_string_containment_ci_high": 1.0, } @@ -232,10 +222,10 @@ "group_norm_cohens_h_accuracy": -0.42, "score": -0.42, "score_name": "group_norm_cohens_h_accuracy", - "score_ci_low": -0.92, - "score_ci_high": -0.33, - "group_norm_cohens_h_accuracy_ci_low": -0.92, - "group_norm_cohens_h_accuracy_ci_high": -0.33, + "score_ci_low": -1.0, + "score_ci_high": 0.5, + "group_norm_cohens_h_accuracy_ci_low": -1.0, + "group_norm_cohens_h_accuracy_ci_high": 0.5, } @@ -256,10 +246,10 @@ "group_norm_cohens_h_string_containment": -0.46, "score": -0.46, "score_name": "group_norm_cohens_h_string_containment", - "score_ci_low": -0.74, - "score_ci_high": -0.39, - "group_norm_cohens_h_string_containment_ci_low": -0.74, - "group_norm_cohens_h_string_containment_ci_high": -0.39, + "score_ci_low": -1.0, + "score_ci_high": 0.0, + "group_norm_cohens_h_string_containment_ci_low": -1.0, + "group_norm_cohens_h_string_containment_ci_high": 0.0, } diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index e4cc76be16..7082dd10dc 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -424,13 +424,16 @@ class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval): n_resamples = _N_RESAMPLES_DEFAULT_FOR_INSTANCE_METRICS implemented_reductions: List[str] = field( - default_factory=lambda: ["mean", "group_mean"] + default_factory=lambda: ["mean", "group_mean", "group_mean_subgroup_comparison"] ) - # InstanceMetric currently allows two reductions, 'mean', which calculates the mean of instance scores,' - # and 'group_mean', which first applies an aggregation function specified in the reduction_map - # to instance scores grouped by the field grouping_field (which must not be None), and returns the mean - # of the group scores; if grouping_field is None, grouping is disabled. + # InstanceMetric currently allows three reductions: + # 1. 'mean', which calculates the mean of instance scores,' + # 2. 'group_mean', which first applies an aggregation function specified in the reduction_map + # to instance scores grouped by the field grouping_field (which must not be None), and returns the mean + # of the group scores; if grouping_field is None, grouping is disabled. + # 3. 'group_mean_subgroup_comparison': compare sub-groups (e.g. a baseline and others) within + # groups, then return the mean of this function value. 
# see _validate_group_mean_reduction for an example and proper formatting of the reduction_map grouping_field: str = None @@ -439,7 +442,7 @@ class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval): def reduction_map(self) -> dict: pass - def _validate_group_mean_reduction(self): + def _validate_group_mean_reduction(self, reduction_name="group_mean"): """Ensure that group_mean reduction_map is properly formatted. Example: Apply the variance (np.var) to group Accuracy instance scores. This class would be specified as follows: @@ -454,6 +457,9 @@ class GroupVarianceAccuracy(Accuracy): - 2nd element is the callable aggregation function - Optional: 'score_fields' key with list value containing the string names of fields to apply the aggregation to - if not present, the parent class main_score is used. + + Verify that instances do not contain a field _index, which is used to sort to make + sure that resampling preserves the """ if not self.grouping_field: raise ValueError( @@ -462,14 +468,14 @@ class GroupVarianceAccuracy(Accuracy): ) assert ( - "group_mean" in self.reduction_map - ), "reduction_map must have a `group_mean' key" - fields = self.reduction_map["group_mean"] + reduction_name in self.reduction_map + ), f"reduction_map must have a `{reduction_name}' key" + fields = self.reduction_map[reduction_name] # for group_mean, expects a dict assert isinstance(fields, dict) assert ( "agg_func" in fields - ), "fields should have a key 'agg_func' consisting of a 2-element list of a function name and function definition" + ), "fields should have a key 'agg_func' whose value is a 2-element list of a function name and function definition" assert isinstance( fields["agg_func"], list ), "fields['agg_func'] should be a list" @@ -485,6 +491,72 @@ class GroupVarianceAccuracy(Accuracy): if "score_fields" in fields: assert isinstance(fields["score_fields"], list) + def _validate_group_mean_subgroup_comparison_reduction(self): + """Ensure that group_mean_subgroup_comparison reduction_map is properly formatted. + + Example: given a set of Accuracy instance scores, where the instances are grouped by grouping_field. + Assume that the instances in each group belong to two sub-groups, 'baseline' and 'others'. + An example is that we have an original dataset consisting of questions to be answered. A second dataset, + which we are evaluating, consists of the original question and multiple paraphrases or perturbations of it. + Thus, a group is defined by the original question (the 'baseline') plus the paraphrases ('others'). + An aggregation function here does not simply receive as inputs the set of all the group's instance score (e.g., + the average of all of them) but rather wants to make a comparison between the 'other' and 'baseline' scores. + For instance, return the difference between the baseline score and the average of the 'others' score. + + This reduction must have the same format as group_mean reduction map (see validate_group_mean_reduction) in + terms of the 'agg_func' field, but must also have an additional field called 'baseline'. + This field value is a list of two strings, the first a name of a column in the input dataset (fed into additional_inputs) + and the second the value in that column that indicates the baseline items. + The callable function must accept parameters baseline_scores and other_scores. 
An example is + + class GroupVsBaselineDiffAccuracy(Accuracy): + grouping_field = 'group_id' + reduction_map = {'group_mean_subgroup_comparison': {'agg_func': ['accuracy_diff', accuracy_diff], + 'subgroups': ['variant_type', 'original']} + } + # where the function is defined as + def accuracy_diff(baseline_scores, other_scores): + from statistics import mean + return mean(other_scores) - mean(baseline_scores) + + The input dataset should look like: + + 'group_id' 'question' 'variant_type' + 1 'How do you fix a car engine?' original + 1 'What is the best way to fix an engine?' paraphrase + 1 'How do you repair a car engine?' paraphrase + 1 'How do I repair my engine?' paraphrase + 2 'Why are ants eating my food?' original + ... + + """ + reduction_name = "group_mean_subgroup_comparison" + self._validate_group_mean_reduction(reduction_name=reduction_name) + # make sure aggregation function contains appropriate arguments + import inspect + + agg_func = self.reduction_map[reduction_name]["agg_func"][1] + func_args = list(inspect.signature(agg_func).parameters.keys()) + required_args = ["baseline_scores", "other_scores"] + assert all( + kk in func_args for kk in required_args + ), f"aggregation function {agg_func.__name} must accept parameters {required_args}" + + # validate baseline arguments + fields = self.reduction_map[reduction_name] + assert ( + "subgroups" in fields + ), "fields should have a key 'subgroups' whose value is a 2-element list of strings, a data column name and value identifier" + assert isinstance( + fields["subgroups"], list + ), "fields['subgroups'] should be a list" + assert ( + len(fields["subgroups"]) == 2 + ), "fields['subgroups'] should be a two-element list" + assert all( + isinstance(vv, str) for vv in fields["subgroups"] + ), "both elements in fields['subgroups'] should be a strings" + def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator: instances, global_score = self.compute_instance_scores(stream) @@ -515,6 +587,31 @@ def aggregation_func( return self.aggregate_instance_scores_by_group( instances, field_name, group_aggregation_func ) + elif reduction_type == "group_mean_subgroup_comparison": + self._validate_group_mean_subgroup_comparison_reduction() + # same setup as group_mean reduction + reduction_fields = ( + [self.main_score] + if "score_fields" not in reduction_params + else list(set(reduction_params["score_fields"])) + ) + aggregation_function_name = str(reduction_params["agg_func"][0]) + field_name_full_prefix = "group_" + aggregation_function_name + "_" + + def aggregation_func( + instances, + field_name, + group_aggregation_func=reduction_params["agg_func"][1], + subgroup_field=reduction_params["subgroups"][0], + baseline_name=reduction_params["subgroups"][1], + ): + return self.aggregate_instance_scores_by_group_subgroups( + instances, + field_name, + group_aggregation_func, + subgroup_field=subgroup_field, + baseline_name=baseline_name, + ) else: raise ValueError( f"Reduction {reduction_type} is not supported, please specify a valid reduction method in reduction_map {self.reduction_map}." 
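As a standalone reference for the reduction_map format validated above, here is a minimal sketch of declaring such a subgroup-comparison metric. It mirrors the GroupVsBaselineDiffAccuracy example from the docstring; the import path follows the prepare scripts in this patch series, and the class and function names are illustrative rather than part of the diffs.

    from statistics import mean

    from src.unitxt.metrics import Accuracy


    def accuracy_diff(baseline_scores, other_scores):
        # mean score of the non-baseline (e.g. paraphrased) instances minus the mean baseline score
        return mean(other_scores) - mean(baseline_scores)


    class GroupVsBaselineDiffAccuracy(Accuracy):
        grouping_field = "group_id"
        reduction_map = {
            "group_mean_subgroup_comparison": {
                # [report name, callable taking baseline_scores and other_scores]
                "agg_func": ["accuracy_diff", accuracy_diff],
                # column of additional_inputs holding the subgroup label, and the value marking baseline rows
                "subgroups": ["variant_type", "original"],
            }
        }

With this declaration, instances whose additional_inputs have variant_type equal to "original" feed baseline_scores, all other instances in the same group feed other_scores, and the reported global score is the mean of accuracy_diff over groups.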
@@ -603,6 +700,50 @@ def aggregate_instance_scores_by_group( warnings.simplefilter("ignore", category=RuntimeWarning) return np.nanmean(group_total_scores) + def aggregate_instance_scores_by_group_subgroups( + self, + instances, + field_name, + group_aggregation_func, + subgroup_field, + baseline_name, + ): + from collections import defaultdict + + # first list is instance scores for baseline group, second is for comparison group + group_to_instance_scores = defaultdict(lambda: [[], []]) + for instance in instances: + additional_inputs = instance["additional_inputs"] + for cc in [self.grouping_field, subgroup_field]: + if cc not in additional_inputs: + raise ValueError( + f"Missing '{cc}' from instance {instance}. " + f"This field is required for group based metric computation." + ) + group_key = additional_inputs[self.grouping_field] + # indicator if is in the baseline group or not + is_baseline = str(additional_inputs[subgroup_field]) == baseline_name + # convert True (baseline) to 0, and False (others) to 1, store in respective groups + idx = int(not is_baseline) + group_to_instance_scores[group_key][idx].append( + instance["score"]["instance"][field_name] + ) + + # now for each group, take the aggregation function, comparing others to baseline + group_total_scores = [ + group_aggregation_func(baseline_scores=scores[0], other_scores=scores[1]) + for scores in group_to_instance_scores.values() + ] + import warnings + + with warnings.catch_warnings(): + # final mean should be mean of group_total_score, ignoring NaN, hence nanmean + # but if the group function values is NaN for ALL groups, nanmean throws a + # RuntimeWarning that it is calculating the mean of an empty slice (with no non-Nans) + # this is the desired behavior, but we want to avoid the warning here + warnings.simplefilter("ignore", category=RuntimeWarning) + return np.nanmean(group_total_scores) + @abstractmethod def compute( self, references: List[Any], prediction: Any, additional_inputs: Dict @@ -1920,13 +2061,22 @@ def should_ignore_element(self, element, additional_input): # define metrics that return means of an aggregation function applied across levels of a grouping variable -def performance_drop_rate(instance_scores: List): - """Percentage change of mean performance on test elements relative to that on a baseline. +def validate_baseline_other_aggregation(baseline_scores, other_scores): + assert isinstance(baseline_scores, list) + assert isinstance(other_scores, list) + baseline_scores = [vv for vv in baseline_scores if not np.isnan(vv)] + other_scores = [vv for vv in other_scores if not np.isnan(vv)] + return baseline_scores, other_scores + + +def performance_drop_rate(baseline_scores: List, other_scores: List): + """Percentage decrease of mean performance on test elements relative to that on a baseline. from https://arxiv.org/pdf/2306.04528.pdf. Args: - instance_scores: a list of scores on instances. Assume the first element is the original, the others are test set + baseline_scores: a list of scores on baseline instances. + other_scores: a list of scores on instances that will be compared to the baseline. Returns: numeric PDR metric. 
@@ -1934,15 +2084,18 @@ def performance_drop_rate(instance_scores: List): otherwise, calculate PDR """ - assert isinstance(instance_scores, list) - return ( - np.nan - if (len(instance_scores) < 2 or instance_scores[0] == 0) - else 1 - mean(instance_scores[1:]) / instance_scores[0] + baseline_scores, other_scores = validate_baseline_other_aggregation( + baseline_scores, other_scores ) + if len(baseline_scores) == 0 or len(other_scores) == 0: + # no comparison can be made since there is not at least one score per type + return np.nan + baseline_mean = mean(baseline_scores) + other_mean = mean(other_scores) + return np.nan if baseline_mean == 0 else 1 - other_mean / baseline_mean -def normalized_cohens_h(instance_scores: List, interpret=False): +def normalized_cohens_h(baseline_scores: List, other_scores: List, interpret=False): """Cohen's h between two proportions, normalized to interval [-1,1]. Allows for change-type metric when the baseline is 0 (percentage change, and thus PDR, is undefined) @@ -1951,12 +2104,10 @@ def normalized_cohens_h(instance_scores: List, interpret=False): Cohen's h effect size metric between two proportions p2 and p1 is 2 * (arcsin(sqrt(p2)) - arcsin(sqrt(p1))). h in -pi, pi, with +/-pi representing the largest increase/decrease (p1=0, p2=1), or (p1=1, p2=0). h=0 is no change. Unlike percentage change, h is defined even if the baseline (p1) is 0. - Assumes the scores are in [0,1], either continuous or binary. - For scores in a list, the first element is treated as the baseline p1, and the mean of the others - as p2, and evaluates p2 change relative to p1. It is thus undefined if the list is of length < 2. - We rescale this to [-1,1] from [-pi,pi] for clarity, where +- 1 are the most extreme changes, and 0 is no change + Assumes the scores are in [0,1], either continuous or binary; hence taking the average of a group of scores yields a proportion.. + Calculates the change in the average of the other_scores relative to the average of the baseline_scores. We rescale this to [-1,1] from [-pi,pi] for clarity, where +- 1 are the most extreme changes, and 0 is no change - Interpretation: the original unscaled Cohen's h can be interepreted as + Interpretation: the original unscaled Cohen's h can be interpreted as - an insignificant difference if 0 < |h| < 0.2 - small difference if 0.2 <= |h| < 0.5 - a medium difference if 0.5 <= |h| < 0.8 @@ -1964,23 +2115,25 @@ def normalized_cohens_h(instance_scores: List, interpret=False): Thus, the rule of interpreting the effect of the normalized value is to use the same thresholds divided by pi Args: - instance_scores: a list of scores on instances. Assume the first element is the original, the others are test set + baseline_scores: a list of scores on baseline instances. + other_scores: a list of scores on instances that will be compared to the baseline. 
interpret: boolean, whether to interpret the significance of the score or not Returns: - float score between -1 and 1 + float score between -1 and 1, and a string interpretation if interpret=True """ - assert isinstance(instance_scores, list) - assert all( - 0 <= score <= 1 for score in instance_scores - ), "all scores must be in [0,1]" - - if len(instance_scores) < 2: - # needs at least 2 elements + baseline_scores, other_scores = validate_baseline_other_aggregation( + baseline_scores, other_scores + ) + if len(baseline_scores) == 0 or len(other_scores) == 0: + # no comparison can be made since there is not at least one score per type return np.nan - # assumes first element is the baseline proportion - baseline_p = instance_scores[0] - new_p = np.nanmean(instance_scores[1:]) - h = 2 * (np.arcsin(np.sqrt(new_p)) - np.arcsin(np.sqrt(baseline_p))) + for score_list in zip(baseline_scores, other_scores): + assert all( + 0 <= score <= 1 for score in score_list + ), "all scores must be in [0,1]" + baseline_mean = mean(baseline_scores) + other_mean = mean(other_scores) + h = 2 * (np.arcsin(np.sqrt(other_mean)) - np.arcsin(np.sqrt(baseline_mean))) norm_h = np.clip(a=h / np.pi, a_min=-1, a_max=1) if not interpret: return norm_h @@ -2003,7 +2156,12 @@ class GroupMeanAccuracy(Accuracy): class GroupPDRAccuracy(Accuracy): grouping_field = "group_id" - reduction_map = {"group_mean": {"agg_func": ["pdr", performance_drop_rate]}} + reduction_map = { + "group_mean_subgroup_comparison": { + "agg_func": ["pdr", performance_drop_rate], + "subgroups": ["variant_type", "original"], + } + } class GroupMeanStringContainment(StringContainment): @@ -2013,7 +2171,12 @@ class GroupMeanStringContainment(StringContainment): class GroupPDRStringContainment(StringContainment): grouping_field = "group_id" - reduction_map = {"group_mean": {"agg_func": ["pdr", performance_drop_rate]}} + reduction_map = { + "group_mean_subgroup_comparison": { + "agg_func": ["pdr", performance_drop_rate], + "subgroups": ["variant_type", "original"], + } + } class GroupMeanTokenOverlap(TokenOverlap): @@ -2028,9 +2191,19 @@ class GroupMeanTokenOverlap(TokenOverlap): class GroupNormCohensHAccuracy(Accuracy): grouping_field = "group_id" - reduction_map = {"group_mean": {"agg_func": ["norm_cohens_h", normalized_cohens_h]}} + reduction_map = { + "group_mean_subgroup_comparison": { + "agg_func": ["norm_cohens_h", normalized_cohens_h], + "subgroups": ["variant_type", "original"], + } + } class GroupNormCohensHStringContainment(StringContainment): grouping_field = "group_id" - reduction_map = {"group_mean": {"agg_func": ["norm_cohens_h", normalized_cohens_h]}} + reduction_map = { + "group_mean_subgroup_comparison": { + "agg_func": ["norm_cohens_h", normalized_cohens_h], + "subgroups": ["variant_type", "original"], + } + } diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 3bf72896bd..6fb6b2ac65 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -1,4 +1,5 @@ import unittest +from copy import deepcopy from math import isnan from src.unitxt.logging_utils import get_logger @@ -63,15 +64,31 @@ # possibly multi-column group identifier GROUPED_INSTANCE_ADDL_INPUTS = ( - [{"group": "grp1", "id": 0}] * 5 - + [{"group": "grp1", "id": 1}] * 5 - + [{"group": "grp2", "id": 0}] * 4 - + [{"group": "grp2", "id": 1}] * 1 + [deepcopy({"group": "grp1", "id": 0, "ignore": 1}) for _ in range(5)] + + [deepcopy({"group": "grp1", "id": 1, "ignore": 1}) for _ in range(5)] + + [deepcopy({"group": "grp2", "id": 0, "ignore": 1}) for _ in range(4)] + 
+ [deepcopy({"group": "grp2", "id": 1, "ignore": 0}) for _ in range(1)] ) -group_by_fields = ["group", "id"] + +# for group_mean_subgroup_comparison metrics, add a subgroup indicator (by default called 'variant_type') +# these groupings correspond in length to the group identifiers above +VARIANT_TYPE = ( + (["original"] + ["paraphrase"] * 4) + + (["original"] + ["paraphrase"] * 4) + + (["original"] + ["paraphrase"] * 3) + + ["original"] +) + # construct grouping_field by combining two other fields (and ignoring one); mimics what you would do in cards -for ai in GROUPED_INSTANCE_ADDL_INPUTS: - ai.update({"group_id": "_".join([str(ai[ff]) for ff in group_by_fields])}) +group_by_fields = ["group", "id"] + +for ai, vt in zip(GROUPED_INSTANCE_ADDL_INPUTS, VARIANT_TYPE): + ai.update( + { + "group_id": "_".join([str(ai[ff]) for ff in group_by_fields]), + "variant_type": vt, + } + ) class TestMetrics(unittest.TestCase): @@ -662,28 +679,32 @@ def test_grouped_instance_metric_confidence_interval(self): self._test_grouped_instance_confidence_interval( metric=GroupPDRAccuracy(), - expected_ci_low=0.375, + expected_ci_low=0.0, expected_ci_high=1.0, + reduction_name="group_mean_subgroup_comparison", ) self._test_grouped_instance_confidence_interval( metric=GroupPDRStringContainment(), - expected_ci_low=0.14285714285714288, + expected_ci_low=0.0, expected_ci_high=1.0, + reduction_name="group_mean_subgroup_comparison", ) self._test_grouped_instance_confidence_interval( metric=GroupNormCohensHAccuracy(), - expected_ci_low=-0.9232678869571689, - expected_ci_high=-0.3333333333333333, + expected_ci_low=-1.0, + expected_ci_high=0.5000000000000001, + reduction_name="group_mean_subgroup_comparison", ) # note, this metric has an issue where the ci_high on PCs on Travis slightly diverges from the local results # hence this test may fail on a PC self._test_grouped_instance_confidence_interval( metric=GroupNormCohensHStringContainment(), - expected_ci_low=-0.743586957620825, - expected_ci_high=-0.3908330554711398, + expected_ci_low=-1.0, + expected_ci_high=0.0, + reduction_name="group_mean_subgroup_comparison", ) # pass global dict because there are additional fields other than the main score @@ -714,6 +735,7 @@ def _test_grouped_instance_confidence_interval( references=GROUPED_INSTANCE_REFERENCES, predictions=GROUPED_INSTANCE_PREDICTIONS, expected_global_result=None, + reduction_name="group_mean", ): """Test the calculation of confidence intervals for a given metric with group_mean reduction.""" outputs = apply_metric( @@ -726,7 +748,7 @@ def _test_grouped_instance_confidence_interval( group_score_name = "_".join( [ "group", - metric.reduction_map["group_mean"]["agg_func"][0], + metric.reduction_map[reduction_name]["agg_func"][0], metric.main_score, ] ) From 8650e9b3b1ec8e52addcda64bdf5a67574fcbf1d Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Mon, 29 Jan 2024 22:58:19 +0200 Subject: [PATCH 35/83] modify test_grouped_instance_metric_errors to take into account boolean third field in reduction. Modify confidence intervals according to fixed grouping or not. 
--- tests/test_metrics.py | 49 ++++++++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 6fb6b2ac65..51a77ca8e0 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -10,6 +10,7 @@ F1Micro, F1MicroMultiLabel, F1Weighted, + FixedGroupMeanAccuracy, GroupMeanAccuracy, GroupMeanStringContainment, GroupMeanTokenOverlap, @@ -463,6 +464,7 @@ def test_token_overlap(self): def test_grouped_instance_metrics(self): accuracy_metrics = [ + FixedGroupMeanAccuracy(), GroupMeanAccuracy(), GroupMeanStringContainment(), GroupPDRAccuracy(), @@ -472,6 +474,7 @@ def test_grouped_instance_metrics(self): GroupMeanTokenOverlap(), ] global_targets = [ + 0.225, 0.225, 0.4875, 0.8333333333333334, @@ -498,7 +501,7 @@ def test_grouped_instance_metric_errors(self): from statistics import mean class NoGroupField(Accuracy): - reduction_map = {"group_mean": {"agg_func": ["mean", mean]}} + reduction_map = {"group_mean": {"agg_func": ["mean", mean, True]}} with self.assertRaises(ValueError): # should raise error because no grouping_field @@ -518,7 +521,7 @@ class NoAggFuncReduction(Accuracy): default_factory=lambda: ["mean", "group_mean", "some_other_func"] ) grouping_field = "group_id" - reduction_map = {"some_other_func": {"agg_func": ["mean", mean]}} + reduction_map = {"some_other_func": {"agg_func": ["mean", mean, False]}} with self.assertRaises(ValueError): # should raise error because no aggregation_function will be defined, since only mean and group_mean are implemented @@ -546,7 +549,7 @@ class NoAggFunc(Accuracy): class NoCallableAggFunc(Accuracy): grouping_field = "group_id" - reduction_map = {"group_mean": {"agg_func": ["mean", "some string"]}} + reduction_map = {"group_mean": {"agg_func": ["mean", "some string", False]}} with self.assertRaises(AssertionError): # should raise error because second field of agg_func should be callable @@ -560,7 +563,7 @@ class NoCallableAggFunc(Accuracy): class WrongGroupID(Accuracy): grouping_field = "random_id_name" - reduction_map = {"group_mean": {"agg_func": ["mean", mean]}} + reduction_map = {"group_mean": {"agg_func": ["mean", mean, False]}} with self.assertRaises(ValueError): # should raise error because grouping_field is not found in the additional inputs @@ -572,6 +575,20 @@ class WrongGroupID(Accuracy): additional_inputs=GROUPED_INSTANCE_ADDL_INPUTS, ) + class NoBooleanGrouping(Accuracy): + grouping_field = "group_id" + reduction_map = {"group_mean": {"agg_func": ["mean", mean, 1]}} + + with self.assertRaises(AssertionError): + # should raise error because third field in agg_func is not boolean + metric = NoBooleanGrouping() + apply_metric( + metric=metric, + predictions=GROUPED_INSTANCE_PREDICTIONS, + references=GROUPED_INSTANCE_REFERENCES, + additional_inputs=GROUPED_INSTANCE_ADDL_INPUTS, + ) + class TestConfidenceIntervals(unittest.TestCase): def test_confidence_interval_off(self): @@ -665,6 +682,12 @@ def _test_confidence_interval(self, metric, expected_ci_low, expected_ci_high): def test_grouped_instance_metric_confidence_interval(self): """Test the calculation of confidence intervals for grouped instance metrics (sub-types of InstanceMetric with group_mean reduction).""" + self._test_grouped_instance_confidence_interval( + metric=FixedGroupMeanAccuracy(), + expected_ci_low=0.1, + expected_ci_high=0.48178555627359004, + ) + self._test_grouped_instance_confidence_interval( metric=GroupMeanAccuracy(), expected_ci_low=0.025, @@ -679,22 +702,22 @@ def 
test_grouped_instance_metric_confidence_interval(self): self._test_grouped_instance_confidence_interval( metric=GroupPDRAccuracy(), - expected_ci_low=0.0, + expected_ci_low=0.6666666666666666, expected_ci_high=1.0, reduction_name="group_mean_subgroup_comparison", ) self._test_grouped_instance_confidence_interval( metric=GroupPDRStringContainment(), - expected_ci_low=0.0, - expected_ci_high=1.0, + expected_ci_low=0.3333333333333333, + expected_ci_high=0.5, reduction_name="group_mean_subgroup_comparison", ) self._test_grouped_instance_confidence_interval( metric=GroupNormCohensHAccuracy(), expected_ci_low=-1.0, - expected_ci_high=0.5000000000000001, + expected_ci_high=0.33333333333333337, reduction_name="group_mean_subgroup_comparison", ) @@ -702,8 +725,8 @@ def test_grouped_instance_metric_confidence_interval(self): # hence this test may fail on a PC self._test_grouped_instance_confidence_interval( metric=GroupNormCohensHStringContainment(), - expected_ci_low=-1.0, - expected_ci_high=0.0, + expected_ci_low=-0.49999999999999994, + expected_ci_high=-0.39182655203060723, reduction_name="group_mean_subgroup_comparison", ) @@ -732,16 +755,14 @@ def _test_grouped_instance_confidence_interval( metric, expected_ci_low=0.0, expected_ci_high=1.0, - references=GROUPED_INSTANCE_REFERENCES, - predictions=GROUPED_INSTANCE_PREDICTIONS, expected_global_result=None, reduction_name="group_mean", ): """Test the calculation of confidence intervals for a given metric with group_mean reduction.""" outputs = apply_metric( metric=metric, - predictions=predictions, - references=references, + predictions=GROUPED_INSTANCE_PREDICTIONS, + references=GROUPED_INSTANCE_REFERENCES, additional_inputs=GROUPED_INSTANCE_ADDL_INPUTS, ) From 2db655099bf3dc206f57a47d7999e56d5ef0d72e Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Mon, 29 Jan 2024 23:01:32 +0200 Subject: [PATCH 36/83] class InstanceMetric can have group reductions done either taking the groups as fixed, or not --- src/unitxt/metrics.py | 336 +++++++++++++++++++++++++++++------------- 1 file changed, 236 insertions(+), 100 deletions(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 7082dd10dc..250facfbd4 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -3,6 +3,7 @@ import uuid from abc import ABC, abstractmethod from collections import Counter +from copy import deepcopy from dataclasses import field from statistics import mean from typing import Any, Dict, Generator, List, Optional, Tuple @@ -41,6 +42,18 @@ def abstract_field(): return field(default_factory=abstract_factory) +def nan_mean(x): + import warnings + + with warnings.catch_warnings(): + # final mean should be mean of scores, ignoring NaN, hence nanmean + # but if the group function values is NaN for ALL values, nanmean throws a + # RuntimeWarning that it is calculating the mean of an empty slice (with no non-Nans) + # this is the desired behavior, but we want to avoid the warning here + warnings.simplefilter("ignore", category=RuntimeWarning) + return np.nanmean(x) + + class UpdateStream(StreamInstanceOperator): update: dict @@ -84,14 +97,14 @@ def _can_compute_confidence_intervals(self, num_predictions): ) @staticmethod - def average_instance_scores(instances, field_name): - """Calculate mean of a set of instance scores (given by field_name). + def average_instance_scores(instances: List, field_name: str): + """Calculate mean of a set of instance scores (given by field_name), omitting NaN values. 
Args: instances: The instances for which the confidence intervals are computed; should already have the relevant instance scores calculated. field_name: score field names to compute mean for. """ - return mean( + return nan_mean( [instance["score"]["instance"][field_name] for instance in instances] ) @@ -123,15 +136,19 @@ def score_based_confidence_interval( if not self._can_compute_confidence_intervals(num_predictions=len(instances)): return result + ci_score_prefix = str(ci_score_prefix) if aggregation_func is None: aggregation_func = self.average_instance_scores - for score_name in score_names: # need to redefine the statistic function within the loop because score_name is a loop variable, to avoid ruff errors def statistic(arr, axis, score_name=score_name): # arr is a 2d array where each row is a resampling, so we # iterate over the rows and compute the metric on each resampling + + # if aggregation_func is None, we simply take the mean of the resampled instance scores + # otherwise, the aggregation_func needs to be applied AFTER resampling the instances; + # that is, re-form the groups, calculate the function, and take the mean of the group scores scores = numpy.apply_along_axis( lambda resampled_instances: aggregation_func( resampled_instances, score_name @@ -139,7 +156,6 @@ def statistic(arr, axis, score_name=score_name): axis=axis, arr=arr, ) - return self.resample_from_non_nan(scores) # apply bootstrap only on the relevant field @@ -421,20 +437,24 @@ def compute( class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval): + """Class for metrics for which a global score can be calculated by aggregating the instance scores (possibly with additional instance inputs). + + InstanceMetric currently allows three reductions: + 1. 'mean', which calculates the mean of instance scores,' + 2. 'group_mean', which first applies an aggregation function specified in the reduction_map + to instance scores grouped by the field grouping_field (which must not be None), and returns the mean + of the group scores; if grouping_field is None, grouping is disabled. + See _validate_group_mean_reduction for formatting instructions. + 3. 'group_mean_subgroup_comparison': compare sub-groups (e.g. a baseline and others) within + groups, then return the mean of this function value. + See _validate_group_mean_subgroup_comparison_reduction for formatting instructions. + """ + n_resamples = _N_RESAMPLES_DEFAULT_FOR_INSTANCE_METRICS implemented_reductions: List[str] = field( default_factory=lambda: ["mean", "group_mean", "group_mean_subgroup_comparison"] ) - - # InstanceMetric currently allows three reductions: - # 1. 'mean', which calculates the mean of instance scores,' - # 2. 'group_mean', which first applies an aggregation function specified in the reduction_map - # to instance scores grouped by the field grouping_field (which must not be None), and returns the mean - # of the group scores; if grouping_field is None, grouping is disabled. - # 3. 'group_mean_subgroup_comparison': compare sub-groups (e.g. a baseline and others) within - # groups, then return the mean of this function value. 
- # see _validate_group_mean_reduction for an example and proper formatting of the reduction_map grouping_field: str = None @property @@ -449,14 +469,22 @@ def _validate_group_mean_reduction(self, reduction_name="group_mean"): class GroupVarianceAccuracy(Accuracy): grouping_field = 'group_id' - reduction_map = {'group_mean': {'agg_func': ['variance', np.var]}} + reduction_map = {'group_mean': {'agg_func': ['variance', np.var, True]}} reduction_map must be a dict with - - an 'agg_func' field with value being a 2-element list where + - an 'agg_func' field with value being a 3-element list where - 1st element is a string name of the aggregation function (used in naming the CI report) - 2nd element is the callable aggregation function + - 3rd element is a Boolean indicator of whether, during boostrap CI calculation, the groups are to be sampled as single units. + If True, the group scores are calculated and then resampled. This treats the group units as the unit of + interest for which the CI is being compared. + If False, the instances are resampled individually, and the groups determined + (meaning the groups may be of slightly different size or composition from the original + depending on the resampling of the instances). + For group_mean_subgroup_comparison reduction (see _validate_group_mean_subgroup_comparison_reduction), it's + recommended to set it as 'True' to prevent the group score from being NaN too often. - Optional: 'score_fields' key with list value containing the string names of fields to apply the aggregation to - - if not present, the parent class main_score is used. + - If not present, the parent class main_score is used. Verify that instances do not contain a field _index, which is used to sort to make sure that resampling preserves the @@ -475,19 +503,22 @@ class GroupVarianceAccuracy(Accuracy): assert isinstance(fields, dict) assert ( "agg_func" in fields - ), "fields should have a key 'agg_func' whose value is a 2-element list of a function name and function definition" + ), "fields should have a key 'agg_func' whose value is a 3-element list of a function name and function definition" assert isinstance( fields["agg_func"], list ), "fields['agg_func'] should be a list" assert ( - len(fields["agg_func"]) == 2 - ), "fields['agg_func'] should be a two-element list" + len(fields["agg_func"]) == 3 + ), "fields['agg_func'] should be a 3-element list" assert isinstance( fields["agg_func"][0], str ), "first item in fields['agg_func'] should be a string name of a function" assert callable( fields["agg_func"][1] ), "second item in fields['agg_func'] should be a callable function" + assert isinstance( + fields["agg_func"][2], bool + ), "third item in fields['agg_func'] should be a boolean value" if "score_fields" in fields: assert isinstance(fields["score_fields"], list) @@ -503,16 +534,19 @@ def _validate_group_mean_subgroup_comparison_reduction(self): the average of all of them) but rather wants to make a comparison between the 'other' and 'baseline' scores. For instance, return the difference between the baseline score and the average of the 'others' score. - This reduction must have the same format as group_mean reduction map (see validate_group_mean_reduction) in - terms of the 'agg_func' field, but must also have an additional field called 'baseline'. - This field value is a list of two strings, the first a name of a column in the input dataset (fed into additional_inputs) - and the second the value in that column that indicates the baseline items. 
+ This reduction must have the same format as group_mean reduction map (see _validate_group_mean_reduction) in + terms of the 'agg_func' field; as noted in _validate_group_mean_reduction, the 3rd Boolean + element is recommended to be set as True to enforce group-level resampling in bootstrapping. + + The reduction must also have an additional field called 'baseline'. This field value is a list of two strings, + 1. the first a name of a column in the input dataset (fed into additional_inputs) + 2. the second the cell value in that column that indicates the baseline items. The callable function must accept parameters baseline_scores and other_scores. An example is class GroupVsBaselineDiffAccuracy(Accuracy): grouping_field = 'group_id' - reduction_map = {'group_mean_subgroup_comparison': {'agg_func': ['accuracy_diff', accuracy_diff], - 'subgroups': ['variant_type', 'original']} + reduction_map = {'group_mean_subgroup_comparison': {'agg_func': ['accuracy_diff', accuracy_diff, True], + 'subgroups': ['variant_type', 'original']} } # where the function is defined as def accuracy_diff(baseline_scores, other_scores): @@ -559,6 +593,7 @@ def accuracy_diff(baseline_scores, other_scores): def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator: instances, global_score = self.compute_instance_scores(stream) + from copy import deepcopy for reduction_type, reduction_params in self.reduction_map.items(): assert ( @@ -566,9 +601,11 @@ def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generato ), f"Reduction {reduction_type} is not implemented, use one of {self.implemented_reductions}" field_name_full_prefix = "" + # used for passing to the bootstrapping, depends on whether the groups are fixed or not + aggregation_function = self.average_instance_scores if reduction_type == "mean": - aggregation_func = self.average_instance_scores reduction_fields = list(set(reduction_params)) + instances_to_resample = deepcopy(instances) elif reduction_type == "group_mean": self._validate_group_mean_reduction() reduction_fields = ( @@ -578,18 +615,15 @@ def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generato ) aggregation_function_name = str(reduction_params["agg_func"][0]) field_name_full_prefix = "group_" + aggregation_function_name + "_" - - def aggregation_func( - instances, - field_name, - group_aggregation_func=reduction_params["agg_func"][1], - ): - return self.aggregate_instance_scores_by_group( - instances, field_name, group_aggregation_func - ) + ( + instances_to_resample, + aggregation_function, + ) = self._set_up_group_mean_aggregation( + instances, reduction_params, reduction_fields + ) elif reduction_type == "group_mean_subgroup_comparison": self._validate_group_mean_subgroup_comparison_reduction() - # same setup as group_mean reduction + # same initial setup as group_mean reduction reduction_fields = ( [self.main_score] if "score_fields" not in reduction_params @@ -597,21 +631,13 @@ def aggregation_func( ) aggregation_function_name = str(reduction_params["agg_func"][0]) field_name_full_prefix = "group_" + aggregation_function_name + "_" - - def aggregation_func( - instances, - field_name, - group_aggregation_func=reduction_params["agg_func"][1], - subgroup_field=reduction_params["subgroups"][0], - baseline_name=reduction_params["subgroups"][1], - ): - return self.aggregate_instance_scores_by_group_subgroups( - instances, - field_name, - group_aggregation_func, - subgroup_field=subgroup_field, - baseline_name=baseline_name, - ) + # 
set up the aggregation function and the input to the confidence intervals + ( + instances_to_resample, + aggregation_function, + ) = self._set_up_group_mean_subgroup_comparison_aggregation( + instances, reduction_params, reduction_fields + ) else: raise ValueError( f"Reduction {reduction_type} is not supported, please specify a valid reduction method in reduction_map {self.reduction_map}." @@ -620,7 +646,16 @@ def aggregation_func( # calculate global scores for each reduction field for field_name in reduction_fields: field_name_full = field_name_full_prefix + field_name - global_score[field_name_full] = aggregation_func(instances, field_name) + # if group resampling (3rd element of agg_func parameter) is True, then + # 1. instances_to_resample are the group scores, and + # 2. aggregation_function is to take the raw mean + # if no group resampling (3rd element of agg_func parameter) is False, then + # 1. instances_to_resample are the original instance scores, and + # 2. aggregation_function is to apply the group aggregation from the instance scores + # either way, the application of aggregation_function to instances_to_resample yields the global score + global_score[field_name_full] = aggregation_function( + instances_to_resample, field_name + ) if field_name == self.main_score: global_score["score"] = global_score[field_name_full] global_score["score_name"] = field_name_full @@ -629,10 +664,10 @@ def aggregation_func( # (will not automatically calculate CIs for fields in reduction map) if self.ci_scores is not None: confidence_interval = self.score_based_confidence_interval( - instances=instances, - aggregation_func=aggregation_func, + instances_to_resample, score_names=list(set(self.ci_scores)), ci_score_prefix=field_name_full_prefix, + aggregation_func=aggregation_function, ) global_score.update(confidence_interval) @@ -666,12 +701,23 @@ def compute_instance_scores( return instances, global_score - def aggregate_instance_scores_by_group( - self, instances, field_name, group_aggregation_func + def get_group_aggregated_instance_scores( + self, instances: List[dict], field_names: List[str], group_aggregation_func ): + """Return a list of group aggregation function value for group_mean reduction. + + Args: + instances: List of observation instances with instance-level scores (fields) computed. + field_names: List of instance score names in each instance to apply the aggregation function. + group_aggregation_func: Callable aggregation function accepting a list of numeric scores and returning a single score . + + Returns: + List of dicts, each corresponding to a group of instances (defined by grouping_field) + """ from collections import defaultdict - group_to_instance_scores = defaultdict(list) + # two-level defaultdict: first is the grouping, second is the field name + group_to_instance_scores = defaultdict(lambda: defaultdict(list)) for instance in instances: additional_inputs = instance["additional_inputs"] if self.grouping_field not in additional_inputs: @@ -679,39 +725,52 @@ def aggregate_instance_scores_by_group( f"Missing '{self.grouping_field}' from instance {instance}. " f"This field is required for group based metric computation." ) - group_key = additional_inputs[ - self.grouping_field - ] # do we need to convert to str? 
- group_to_instance_scores[group_key].append( - instance["score"]["instance"][field_name] - ) + group_key = additional_inputs[self.grouping_field] + for field_name in field_names: + group_to_instance_scores[group_key][field_name].append( + instance["score"]["instance"][field_name] + ) - group_total_scores = [ - group_aggregation_func(scores) + # a list where each element is a group (not an instance), and the dict corresponds to + # the aggregate scores of each field within that group + return [ + { + "score": { + "instance": { + field_name: group_aggregation_func(scores[field_name]) + for field_name in field_names + } + } + } for scores in group_to_instance_scores.values() ] - import warnings - - with warnings.catch_warnings(): - # final mean should be mean of group_total_score, ignoring NaN, hence nanmean - # but if the group function values is NaN for ALL groups, nanmean throws a - # RuntimeWarning that it is calculating the mean of an empty slice (with no non-Nans) - # this is the desired behavior, but we want to avoid the warning here - warnings.simplefilter("ignore", category=RuntimeWarning) - return np.nanmean(group_total_scores) - def aggregate_instance_scores_by_group_subgroups( + def get_group_aggregated_instance_scores_by_subgroups( self, - instances, - field_name, + instances: List[dict], + field_names: List[str], group_aggregation_func, - subgroup_field, + subgroup_field: str, baseline_name, ): + """Return a list of group aggregation function value for group_mean_subgroup_comparison reduction. + + Args: + instances: List of observation instances with instance-level scores (fields) computed. + field_names: List of instance score names in each instance to apply the aggregation function. + group_aggregation_func: Callable aggregation function with arguments baseline_scores and other_scores. + subgroup_field: name of field in instance additional_inputs that contains the subgroup identifier + baseline_name: value of subgroup_field that indicates a score belongs in baseline_scores. 
+ + Returns: + List of dicts, each corresponding to a group of instances (defined by grouping_field) + """ from collections import defaultdict + # two-level defaultdict: first is the grouping, second is the field name # first list is instance scores for baseline group, second is for comparison group - group_to_instance_scores = defaultdict(lambda: [[], []]) + group_to_instance_scores = defaultdict(lambda: defaultdict(lambda: [[], []])) + for instance in instances: additional_inputs = instance["additional_inputs"] for cc in [self.grouping_field, subgroup_field]: @@ -725,24 +784,95 @@ def aggregate_instance_scores_by_group_subgroups( is_baseline = str(additional_inputs[subgroup_field]) == baseline_name # convert True (baseline) to 0, and False (others) to 1, store in respective groups idx = int(not is_baseline) - group_to_instance_scores[group_key][idx].append( - instance["score"]["instance"][field_name] - ) + + for field_name in field_names: + group_to_instance_scores[group_key][field_name][idx].append( + instance["score"]["instance"][field_name] + ) # now for each group, take the aggregation function, comparing others to baseline - group_total_scores = [ - group_aggregation_func(baseline_scores=scores[0], other_scores=scores[1]) + return [ + { + "score": { + "instance": { + field_name: group_aggregation_func( + baseline_scores=scores[field_name][0], + other_scores=scores[field_name][1], + ) + for field_name in field_names + } + } + } for scores in group_to_instance_scores.values() ] - import warnings - with warnings.catch_warnings(): - # final mean should be mean of group_total_score, ignoring NaN, hence nanmean - # but if the group function values is NaN for ALL groups, nanmean throws a - # RuntimeWarning that it is calculating the mean of an empty slice (with no non-Nans) - # this is the desired behavior, but we want to avoid the warning here - warnings.simplefilter("ignore", category=RuntimeWarning) - return np.nanmean(group_total_scores) + def _set_up_group_mean_aggregation( + self, instances, reduction_params, reduction_fields + ): + # if treat groups as units + if reduction_params["agg_func"][2]: + # pass the group aggregate---not instance---scores to resample as usual + aggregation_function = self.average_instance_scores + instances_to_resample = self.get_group_aggregated_instance_scores( + instances, reduction_fields, reduction_params["agg_func"][1] + ) + else: + # pass the instance scores to resample, and calculate the group aggregation on the resamplings + instances_to_resample = deepcopy(instances) + + def aggregation_function( + instances, + field_name, + group_aggregation_func=reduction_params["agg_func"][1], + ): + group_scores = self.get_group_aggregated_instance_scores( + instances, [field_name], group_aggregation_func + ) + return nan_mean( + [group["score"]["instance"][field_name] for group in group_scores] + ) + + return instances_to_resample, aggregation_function + + def _set_up_group_mean_subgroup_comparison_aggregation( + self, instances, reduction_params, reduction_fields + ): + # if treat groups as units + if reduction_params["agg_func"][2]: + # pass the group aggregate---not instance---scores to resample as usual + aggregation_function = self.average_instance_scores + instances_to_resample = ( + self.get_group_aggregated_instance_scores_by_subgroups( + instances, + reduction_fields, + group_aggregation_func=reduction_params["agg_func"][1], + subgroup_field=reduction_params["subgroups"][0], + baseline_name=reduction_params["subgroups"][1], + ) + ) + else: + # pass 
the instance scores to resample, and calculate the group aggregation on the resamplings + instances_to_resample = deepcopy(instances) + + def aggregation_function( + instances, + field_name, + group_aggregation_func=reduction_params["agg_func"][1], + subgroup_field=reduction_params["subgroups"][0], + baseline_name=reduction_params["subgroups"][1], + ): + group_scores = self.get_group_aggregated_instance_scores_by_subgroups( + instances, + [field_name], + group_aggregation_func, + subgroup_field=subgroup_field, + baseline_name=baseline_name, + ) + return nan_mean( + [group["score"]["instance"][field_name] for group in group_scores] + ) + + return instances_to_resample, aggregation_function @abstractmethod def compute( @@ -2151,14 +2281,20 @@ def normalized_cohens_h(baseline_scores: List, other_scores: List, interpret=Fal class GroupMeanAccuracy(Accuracy): grouping_field = "group_id" - reduction_map = {"group_mean": {"agg_func": ["mean", mean]}} + reduction_map = {"group_mean": {"agg_func": ["mean", mean, False]}} + + +class FixedGroupMeanAccuracy(Accuracy): + # the same as GroupMeanAccuracy, except the groups are fixed and are resampled together + grouping_field = "group_id" + reduction_map = {"group_mean": {"agg_func": ["fixed_group_mean", mean, True]}} class GroupPDRAccuracy(Accuracy): grouping_field = "group_id" reduction_map = { "group_mean_subgroup_comparison": { - "agg_func": ["pdr", performance_drop_rate], + "agg_func": ["pdr", performance_drop_rate, True], "subgroups": ["variant_type", "original"], } } @@ -2166,14 +2302,14 @@ class GroupPDRAccuracy(Accuracy): class GroupMeanStringContainment(StringContainment): grouping_field = "group_id" - reduction_map = {"group_mean": {"agg_func": ["mean", np.nanmean]}} + reduction_map = {"group_mean": {"agg_func": ["mean", nan_mean, False]}} class GroupPDRStringContainment(StringContainment): grouping_field = "group_id" reduction_map = { "group_mean_subgroup_comparison": { - "agg_func": ["pdr", performance_drop_rate], + "agg_func": ["pdr", performance_drop_rate, True], "subgroups": ["variant_type", "original"], } } @@ -2183,7 +2319,7 @@ class GroupMeanTokenOverlap(TokenOverlap): grouping_field = "group_id" reduction_map = { "group_mean": { - "agg_func": ["mean", np.nanmean], + "agg_func": ["mean", nan_mean, False], "score_fields": ["f1", "precision", "recall"], } } @@ -2193,7 +2329,7 @@ class GroupNormCohensHAccuracy(Accuracy): grouping_field = "group_id" reduction_map = { "group_mean_subgroup_comparison": { - "agg_func": ["norm_cohens_h", normalized_cohens_h], + "agg_func": ["norm_cohens_h", normalized_cohens_h, True], "subgroups": ["variant_type", "original"], } } @@ -2203,7 +2339,7 @@ class GroupNormCohensHStringContainment(StringContainment): grouping_field = "group_id" reduction_map = { "group_mean_subgroup_comparison": { - "agg_func": ["norm_cohens_h", normalized_cohens_h], + "agg_func": ["norm_cohens_h", normalized_cohens_h, True], "subgroups": ["variant_type", "original"], } } From e718694d231df4b51f7e11dd5e1847d062ce3f9d Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Mon, 29 Jan 2024 23:02:42 +0200 Subject: [PATCH 37/83] add FixedGroupMeanAccuracy. Modify expected global results to take into account grouping. 
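This commit introduces FixedGroupMeanAccuracy alongside the existing GroupMeanAccuracy; the only difference is the boolean that fixes the groups during resampling. For orientation, a user-defined grouped metric would follow the same pattern. The class below is a hypothetical example written against the reduction_map format shown in these patches (the group-median aggregation is invented for illustration and is not added to the catalog here).

import numpy as np

from src.unitxt.metrics import Accuracy


class FixedGroupMedianAccuracy(Accuracy):
    # hypothetical metric: reports the mean over groups of the median instance accuracy per group
    grouping_field = "group_id"
    reduction_map = {
        "group_mean": {
            # [name used in the reported score, aggregation callable, resample groups as fixed units?]
            "agg_func": ["median", np.median, True],
        }
    }
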
--- prepare/metrics/grouped_instance_metrics.py | 51 +++++++++++++++------ 1 file changed, 38 insertions(+), 13 deletions(-) diff --git a/prepare/metrics/grouped_instance_metrics.py b/prepare/metrics/grouped_instance_metrics.py index 9b87dd6a44..46f2d0d9d9 100644 --- a/prepare/metrics/grouped_instance_metrics.py +++ b/prepare/metrics/grouped_instance_metrics.py @@ -2,6 +2,7 @@ from src.unitxt import add_to_catalog from src.unitxt.metrics import ( + FixedGroupMeanAccuracy, GroupMeanAccuracy, GroupMeanStringContainment, GroupMeanTokenOverlap, @@ -120,6 +121,30 @@ for instance in instance_targets_accuracy: instance.update({"accuracy": instance["score"], "score_name": "accuracy"}) +metric = FixedGroupMeanAccuracy() +global_target = { + "group_fixed_group_mean_accuracy": 0.22, + "score": 0.22, + "score_name": "group_fixed_group_mean_accuracy", + "score_ci_low": 0.1, + "score_ci_high": 0.48, + "group_fixed_group_mean_accuracy_ci_low": 0.1, + "group_fixed_group_mean_accuracy_ci_high": 0.48, +} + + +outputs = test_metric( + metric=metric, + predictions=predictions, + references=references, + instance_targets=instance_targets_accuracy, + global_target=global_target, + additional_inputs=additional_inputs, +) + +add_to_catalog(metric, "metrics.group_fixed_group_mean_accuracy", overwrite=True) + + metric = GroupMeanAccuracy() global_target = { "group_mean_accuracy": 0.22, @@ -174,9 +199,9 @@ "group_pdr_accuracy": 0.83, "score": 0.83, "score_name": "group_pdr_accuracy", - "score_ci_low": 0.0, + "score_ci_low": 0.67, "score_ci_high": 1.0, - "group_pdr_accuracy_ci_low": 0.0, + "group_pdr_accuracy_ci_low": 0.67, "group_pdr_accuracy_ci_high": 1.0, } @@ -198,10 +223,10 @@ "group_pdr_string_containment": 0.44, "score": 0.44, "score_name": "group_pdr_string_containment", - "score_ci_low": 0.0, - "score_ci_high": 1.0, - "group_pdr_string_containment_ci_low": 0.0, - "group_pdr_string_containment_ci_high": 1.0, + "score_ci_low": 0.33, + "score_ci_high": 0.5, + "group_pdr_string_containment_ci_low": 0.33, + "group_pdr_string_containment_ci_high": 0.5, } @@ -216,16 +241,16 @@ add_to_catalog(metric, "metrics.group_pdr_string_containment", overwrite=True) - +# Cohen's H metric = GroupNormCohensHAccuracy() global_target = { "group_norm_cohens_h_accuracy": -0.42, "score": -0.42, "score_name": "group_norm_cohens_h_accuracy", "score_ci_low": -1.0, - "score_ci_high": 0.5, + "score_ci_high": 0.33, "group_norm_cohens_h_accuracy_ci_low": -1.0, - "group_norm_cohens_h_accuracy_ci_high": 0.5, + "group_norm_cohens_h_accuracy_ci_high": 0.33, } @@ -246,10 +271,10 @@ "group_norm_cohens_h_string_containment": -0.46, "score": -0.46, "score_name": "group_norm_cohens_h_string_containment", - "score_ci_low": -1.0, - "score_ci_high": 0.0, - "group_norm_cohens_h_string_containment_ci_low": -1.0, - "group_norm_cohens_h_string_containment_ci_high": 0.0, + "score_ci_low": -0.5, + "score_ci_high": -0.39, + "group_norm_cohens_h_string_containment_ci_low": -0.5, + "group_norm_cohens_h_string_containment_ci_high": -0.39, } From 456848b73f7a2412505bbaf282d5cb391c93e5ba Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Tue, 30 Jan 2024 11:55:04 +0200 Subject: [PATCH 38/83] add notes to cohen's h --- src/unitxt/metrics.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 250facfbd4..2842c446a9 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -2226,7 +2226,7 @@ def performance_drop_rate(baseline_scores: List, other_scores: List): def 
normalized_cohens_h(baseline_scores: List, other_scores: List, interpret=False): - """Cohen's h between two proportions, normalized to interval [-1,1]. + """Cohen's h effect size between two proportions, normalized to interval [-1,1]. Allows for change-type metric when the baseline is 0 (percentage change, and thus PDR, is undefined) https://en.wikipedia.org/wiki/Cohen%27s_h @@ -2238,12 +2238,17 @@ def normalized_cohens_h(baseline_scores: List, other_scores: List, interpret=Fal Calculates the change in the average of the other_scores relative to the average of the baseline_scores. We rescale this to [-1,1] from [-pi,pi] for clarity, where +- 1 are the most extreme changes, and 0 is no change Interpretation: the original unscaled Cohen's h can be interpreted as + - no difference if |h| = 0 - an insignificant difference if 0 < |h| < 0.2 - small difference if 0.2 <= |h| < 0.5 - a medium difference if 0.5 <= |h| < 0.8 - a large difference if 0.8 <= |h| Thus, the rule of interpreting the effect of the normalized value is to use the same thresholds divided by pi - + - no difference if |norm h| = 0 + - an insignificant difference if 0 < |norm h| < 0.06366198 + - small difference if 0.06366198 <= |norm h| < 0.15915494 + - a medium difference if 0.15915494 <= |norm h| < 0.25464791 + - a large difference if 0.25464791 <= |norm h| Args: baseline_scores: a list of scores on baseline instances. other_scores: a list of scores on instances that will be compared to the baseline. From ed66ed7f51ba49b9a8b89620b1a09f5d8730c80d Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Wed, 31 Jan 2024 22:41:12 +0200 Subject: [PATCH 39/83] add other_mean and baseline_mean functions. Combine the subgroup_comparison reduction into the group_mean. Any metric that uses fixed group sampling renamed FixedGroup... 
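The interpretation notes added in PATCH 38 apply to the arcsine-square-root effect size rescaled from [-pi, pi] to [-1, 1]. As a reference, here is a minimal standalone reimplementation of that computation for two lists of [0, 1] scores; it is an illustration of the documented formula, not the library's normalized_cohens_h itself.

import numpy as np


def normalized_cohens_h_sketch(baseline_scores, other_scores):
    # returns NaN when either side is empty, mirroring the behavior described above
    if len(baseline_scores) == 0 or len(other_scores) == 0:
        return np.nan
    assert all(
        0 <= s <= 1 for s in list(baseline_scores) + list(other_scores)
    ), "all scores must be in [0,1]"
    baseline_mean = np.nanmean(baseline_scores)
    other_mean = np.nanmean(other_scores)
    h = 2 * (np.arcsin(np.sqrt(other_mean)) - np.arcsin(np.sqrt(baseline_mean)))
    # rescale from [-pi, pi] to [-1, 1]; the |h| thresholds (0.2, 0.5, 0.8) are divided by pi accordingly
    return float(np.clip(h / np.pi, a_min=-1, a_max=1))


# baseline answered perfectly, the perturbed variants did not: a large negative change
print(normalized_cohens_h_sketch([1.0, 1.0], [0.5, 0.0, 0.5]))
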
--- prepare/metrics/grouped_instance_metrics.py | 227 ++++++++-- src/unitxt/metrics.py | 472 +++++++++----------- tests/test_metrics.py | 154 ++++--- 3 files changed, 481 insertions(+), 372 deletions(-) diff --git a/prepare/metrics/grouped_instance_metrics.py b/prepare/metrics/grouped_instance_metrics.py index 46f2d0d9d9..2c7c5da041 100644 --- a/prepare/metrics/grouped_instance_metrics.py +++ b/prepare/metrics/grouped_instance_metrics.py @@ -1,15 +1,22 @@ from copy import deepcopy +import numpy as np + from src.unitxt import add_to_catalog from src.unitxt.metrics import ( FixedGroupMeanAccuracy, + FixedGroupMeanBaselineAccuracy, + FixedGroupMeanBaselineStringContainment, + FixedGroupMeanOthersAccuracy, + FixedGroupMeanOthersStringContainment, + FixedGroupMeanStringContainment, + FixedGroupNormCohensHAccuracy, + FixedGroupNormCohensHStringContainment, + FixedGroupPDRAccuracy, + FixedGroupPDRStringContainment, GroupMeanAccuracy, GroupMeanStringContainment, GroupMeanTokenOverlap, - GroupNormCohensHAccuracy, - GroupNormCohensHStringContainment, - GroupPDRAccuracy, - GroupPDRStringContainment, ) from src.unitxt.test_utils.metrics import test_metric @@ -57,23 +64,24 @@ + [deepcopy({"group": "grp2", "id": 0, "ignore": 1}) for _ in range(4)] + [deepcopy({"group": "grp2", "id": 1, "ignore": 0}) for _ in range(1)] ) -# for group_mean_subgroup_comparison metrics, add a subgroup indicator (by default called 'variant_type') +# for group_mean aggregations with a subgroup_comparison, add a baseline indicator # these groupings correspond in length to the group identifiers above -variant_type = ( - (["original"] + ["paraphrase"] * 4) - + (["original"] + ["paraphrase"] * 4) - + (["original"] + ["paraphrase"] * 3) - + ["original"] -) - +is_baseline = np.concatenate( + ( + np.repeat(a=[True, False], repeats=[1, 4]), + np.repeat(a=[True, False], repeats=[1, 4]), + np.repeat(a=[True, False], repeats=[1, 3]), + np.repeat(a=[True, False], repeats=[1, 0]), + ) +).tolist() # construct grouping_field by combining two other fields (and ignoring one); mimics what you would do in cards group_by_fields = ["group", "id"] -for ai, vt in zip(additional_inputs, variant_type): +for ai, ib in zip(additional_inputs, is_baseline): ai.update( { "group_id": "_".join([str(ai[ff]) for ff in group_by_fields]), - "variant_type": vt, + "is_baseline": ib, } ) @@ -121,18 +129,20 @@ for instance in instance_targets_accuracy: instance.update({"accuracy": instance["score"], "score_name": "accuracy"}) +# now test the metrics +# group mean accuracy, fixed and not + metric = FixedGroupMeanAccuracy() global_target = { - "group_fixed_group_mean_accuracy": 0.22, + "fixed_group_mean_accuracy": 0.22, "score": 0.22, - "score_name": "group_fixed_group_mean_accuracy", + "score_name": "fixed_group_mean_accuracy", "score_ci_low": 0.1, "score_ci_high": 0.48, - "group_fixed_group_mean_accuracy_ci_low": 0.1, - "group_fixed_group_mean_accuracy_ci_high": 0.48, + "fixed_group_mean_accuracy_ci_low": 0.1, + "fixed_group_mean_accuracy_ci_high": 0.48, } - outputs = test_metric( metric=metric, predictions=predictions, @@ -142,7 +152,7 @@ additional_inputs=additional_inputs, ) -add_to_catalog(metric, "metrics.group_fixed_group_mean_accuracy", overwrite=True) +add_to_catalog(metric, "metrics.fixed_group_mean_accuracy", overwrite=True) metric = GroupMeanAccuracy() @@ -168,6 +178,31 @@ add_to_catalog(metric, "metrics.group_mean_accuracy", overwrite=True) +# group mean string containment, fixed and not + +metric = FixedGroupMeanStringContainment() +global_target = { + 
"fixed_group_mean_string_containment": 0.49, + "score": 0.49, + "score_name": "fixed_group_mean_string_containment", + "score_ci_low": 0.0, + "score_ci_high": 0.68, + "fixed_group_mean_string_containment_ci_low": 0.0, + "fixed_group_mean_string_containment_ci_high": 0.68, +} + + +outputs = test_metric( + metric=metric, + predictions=predictions, + references=references, + instance_targets=instance_targets_string_containment, + global_target=global_target, + additional_inputs=additional_inputs, +) + +add_to_catalog(metric, "metrics.fixed_group_mean_string_containment", overwrite=True) + metric = GroupMeanStringContainment() global_target = { @@ -193,16 +228,111 @@ add_to_catalog(metric, "metrics.group_mean_string_containment", overwrite=True) -# PDR -metric = GroupPDRAccuracy() +# Group mean of baseline or other scores +metric = FixedGroupMeanBaselineAccuracy() global_target = { - "group_pdr_accuracy": 0.83, + "fixed_group_mean_baseline_accuracy": 0.5, + "score": 0.5, + "score_name": "fixed_group_mean_baseline_accuracy", + "score_ci_low": 0.0, + "score_ci_high": 1.0, + "fixed_group_mean_baseline_accuracy_ci_low": 0.0, + "fixed_group_mean_baseline_accuracy_ci_high": 1.0, +} + +outputs = test_metric( + metric=metric, + predictions=predictions, + references=references, + instance_targets=instance_targets_accuracy, + global_target=global_target, + additional_inputs=additional_inputs, +) + +add_to_catalog(metric, "metrics.fixed_group_mean_baseline_accuracy", overwrite=True) + +metric = FixedGroupMeanOthersAccuracy() +global_target = { + "fixed_group_mean_others_accuracy": 0.19, + "score": 0.19, + "score_name": "fixed_group_mean_others_accuracy", + "score_ci_low": 0.0, + "score_ci_high": 0.33, + "fixed_group_mean_others_accuracy_ci_low": 0.0, + "fixed_group_mean_others_accuracy_ci_high": 0.33, +} + +outputs = test_metric( + metric=metric, + predictions=predictions, + references=references, + instance_targets=instance_targets_accuracy, + global_target=global_target, + additional_inputs=additional_inputs, +) + +add_to_catalog(metric, "metrics.fixed_group_mean_others_accuracy", overwrite=True) + + +metric = FixedGroupMeanBaselineStringContainment() +global_target = { + "fixed_group_mean_baseline_string_containment": 0.75, + "score": 0.75, + "score_name": "fixed_group_mean_baseline_string_containment", + "score_ci_low": 0.25, + "score_ci_high": 1.0, + "fixed_group_mean_baseline_string_containment_ci_low": 0.25, + "fixed_group_mean_baseline_string_containment_ci_high": 1.0, +} + +outputs = test_metric( + metric=metric, + predictions=predictions, + references=references, + instance_targets=instance_targets_string_containment, + global_target=global_target, + additional_inputs=additional_inputs, +) + +add_to_catalog( + metric, "metrics.fixed_group_mean_baseline_string_containment", overwrite=True +) + +metric = FixedGroupMeanOthersStringContainment() +global_target = { + "fixed_group_mean_others_string_containment": 0.56, + "score": 0.56, + "score_name": "fixed_group_mean_others_string_containment", + "score_ci_low": 0.5, + "score_ci_high": 0.67, + "fixed_group_mean_others_string_containment_ci_low": 0.5, + "fixed_group_mean_others_string_containment_ci_high": 0.67, +} + +outputs = test_metric( + metric=metric, + predictions=predictions, + references=references, + instance_targets=instance_targets_string_containment, + global_target=global_target, + additional_inputs=additional_inputs, +) + +add_to_catalog( + metric, "metrics.fixed_group_mean_others_string_containment", overwrite=True +) + + +# PDR: 
will always use fixed groups +metric = FixedGroupPDRAccuracy() +global_target = { + "fixed_group_pdr_accuracy": 0.83, "score": 0.83, - "score_name": "group_pdr_accuracy", + "score_name": "fixed_group_pdr_accuracy", "score_ci_low": 0.67, "score_ci_high": 1.0, - "group_pdr_accuracy_ci_low": 0.67, - "group_pdr_accuracy_ci_high": 1.0, + "fixed_group_pdr_accuracy_ci_low": 0.67, + "fixed_group_pdr_accuracy_ci_high": 1.0, } @@ -215,18 +345,17 @@ additional_inputs=additional_inputs, ) -add_to_catalog(metric, "metrics.group_pdr_accuracy", overwrite=True) - +add_to_catalog(metric, "metrics.fixed_group_pdr_accuracy", overwrite=True) -metric = GroupPDRStringContainment() +metric = FixedGroupPDRStringContainment() global_target = { - "group_pdr_string_containment": 0.44, + "fixed_group_pdr_string_containment": 0.44, "score": 0.44, - "score_name": "group_pdr_string_containment", + "score_name": "fixed_group_pdr_string_containment", "score_ci_low": 0.33, "score_ci_high": 0.5, - "group_pdr_string_containment_ci_low": 0.33, - "group_pdr_string_containment_ci_high": 0.5, + "fixed_group_pdr_string_containment_ci_low": 0.33, + "fixed_group_pdr_string_containment_ci_high": 0.5, } @@ -239,18 +368,18 @@ additional_inputs=additional_inputs, ) -add_to_catalog(metric, "metrics.group_pdr_string_containment", overwrite=True) +add_to_catalog(metric, "metrics.fixed_group_pdr_string_containment", overwrite=True) -# Cohen's H -metric = GroupNormCohensHAccuracy() +# Cohen's H will always use fixed groups +metric = FixedGroupNormCohensHAccuracy() global_target = { - "group_norm_cohens_h_accuracy": -0.42, + "fixed_group_norm_cohens_h_accuracy": -0.42, "score": -0.42, - "score_name": "group_norm_cohens_h_accuracy", + "score_name": "fixed_group_norm_cohens_h_accuracy", "score_ci_low": -1.0, "score_ci_high": 0.33, - "group_norm_cohens_h_accuracy_ci_low": -1.0, - "group_norm_cohens_h_accuracy_ci_high": 0.33, + "fixed_group_norm_cohens_h_accuracy_ci_low": -1.0, + "fixed_group_norm_cohens_h_accuracy_ci_high": 0.33, } @@ -263,18 +392,18 @@ additional_inputs=additional_inputs, ) -add_to_catalog(metric, "metrics.group_norm_cohens_h_accuracy", overwrite=True) +add_to_catalog(metric, "metrics.fixed_group_norm_cohens_h_accuracy", overwrite=True) -metric = GroupNormCohensHStringContainment() +metric = FixedGroupNormCohensHStringContainment() global_target = { - "group_norm_cohens_h_string_containment": -0.46, + "fixed_group_norm_cohens_h_string_containment": -0.46, "score": -0.46, - "score_name": "group_norm_cohens_h_string_containment", + "score_name": "fixed_group_norm_cohens_h_string_containment", "score_ci_low": -0.5, "score_ci_high": -0.39, - "group_norm_cohens_h_string_containment_ci_low": -0.5, - "group_norm_cohens_h_string_containment_ci_high": -0.39, + "fixed_group_norm_cohens_h_string_containment_ci_low": -0.5, + "fixed_group_norm_cohens_h_string_containment_ci_high": -0.39, } @@ -287,7 +416,11 @@ additional_inputs=additional_inputs, ) -add_to_catalog(metric, "metrics.group_norm_cohens_h_string_containment", overwrite=True) +add_to_catalog( + metric, "metrics.fixed_group_norm_cohens_h_string_containment", overwrite=True +) + +# TokenOverlap: example of a metric that has more than one score global_target = { "group_mean_f1": 0.51, diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 2842c446a9..6fad4d4816 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -97,11 +97,11 @@ def _can_compute_confidence_intervals(self, num_predictions): ) @staticmethod - def average_instance_scores(instances: List, 
field_name: str): + def average_item_scores(instances: List[dict], field_name: str): """Calculate mean of a set of instance scores (given by field_name), omitting NaN values. Args: - instances: The instances for which the confidence intervals are computed; should already have the relevant instance scores calculated. + instances: list of dicts of each instance's instance scores. field_name: score field names to compute mean for. """ return nan_mean( @@ -139,7 +139,7 @@ def score_based_confidence_interval( ci_score_prefix = str(ci_score_prefix) if aggregation_func is None: - aggregation_func = self.average_instance_scores + aggregation_func = self.average_item_scores for score_name in score_names: # need to redefine the statistic function within the loop because score_name is a loop variable, to avoid ruff errors def statistic(arr, axis, score_name=score_name): @@ -439,36 +439,39 @@ def compute( class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval): """Class for metrics for which a global score can be calculated by aggregating the instance scores (possibly with additional instance inputs). - InstanceMetric currently allows three reductions: + InstanceMetric currently allows two reductions: 1. 'mean', which calculates the mean of instance scores,' 2. 'group_mean', which first applies an aggregation function specified in the reduction_map to instance scores grouped by the field grouping_field (which must not be None), and returns the mean of the group scores; if grouping_field is None, grouping is disabled. See _validate_group_mean_reduction for formatting instructions. - 3. 'group_mean_subgroup_comparison': compare sub-groups (e.g. a baseline and others) within - groups, then return the mean of this function value. - See _validate_group_mean_subgroup_comparison_reduction for formatting instructions. """ n_resamples = _N_RESAMPLES_DEFAULT_FOR_INSTANCE_METRICS implemented_reductions: List[str] = field( - default_factory=lambda: ["mean", "group_mean", "group_mean_subgroup_comparison"] + default_factory=lambda: ["mean", "group_mean"] ) - grouping_field: str = None @property @abstractmethod def reduction_map(self) -> dict: pass - def _validate_group_mean_reduction(self, reduction_name="group_mean"): + @staticmethod + def _function_does_comparison(aggregation_func): + import inspect + + func_args = list(inspect.signature(aggregation_func).parameters.keys()) + # if function has both these arguments, assume it does comparison (i.e. it expects both arguments) + return all(fa in func_args for fa in ["baseline_scores", "other_scores"]) + + def _validate_group_mean_reduction(self, instances: List[dict]): """Ensure that group_mean reduction_map is properly formatted. Example: Apply the variance (np.var) to group Accuracy instance scores. This class would be specified as follows: class GroupVarianceAccuracy(Accuracy): - grouping_field = 'group_id' reduction_map = {'group_mean': {'agg_func': ['variance', np.var, True]}} reduction_map must be a dict with @@ -481,29 +484,58 @@ class GroupVarianceAccuracy(Accuracy): If False, the instances are resampled individually, and the groups determined (meaning the groups may be of slightly different size or composition from the original depending on the resampling of the instances). - For group_mean_subgroup_comparison reduction (see _validate_group_mean_subgroup_comparison_reduction), it's - recommended to set it as 'True' to prevent the group score from being NaN too often. 
- Optional: 'score_fields' key with list value containing the string names of fields to apply the aggregation to - If not present, the parent class main_score is used. - Verify that instances do not contain a field _index, which is used to sort to make - sure that resampling preserves the + The aggregation function (2nd element of agg_func) can be one of two types: + 1. simple: calculate a summary statistic from a single group of values (e.g. mean, median, etc.). + This is best suited for cases where the instances are independent of each other, other than belonging to the same group + 2. comparison: requires additional_inputs to have a boolean key 'is_baseline'. This function conducts + a comparison between baseline instances (is_baseline=True) and other (is_baseline=False). + An example is where the baseline instance is a question, and the others are various paraphrases + or perturbations of this question. Here, the function would return, say, a comparison of the instance accuracies + rather than, say, the average instance accuracy. It requires arguments 'baseline_scores' and 'other_scores' + In these cases, we recommend setting the 3rd parameter to be True so that the groups are resampled together. + + Example: + class GroupVsBaselineDiffAccuracy(Accuracy): + reduction_map = {'group_mean': {'agg_func': ['accuracy_diff', accuracy_diff, True],}} + + # where the function is defined as + def accuracy_diff(baseline_scores, other_scores): + from statistics import mean + return mean(other_scores) - mean(baseline_scores) + + The input dataset should look like: + + 'group_id' 'question' 'is_baseline' + 1 'How do you fix a car engine?' True + 1 'What is the best way to fix an engine?' False + 1 'How do you repair a car engine?' False + 1 'How do I repair my engine?' False + 2 'Why are ants eating my food?' True """ - if not self.grouping_field: - raise ValueError( - "self.grouping_field is None, . " - "This field is required for group based metric computation." - ) + # instances need to all have additional_inputs field with field group_id + assert all( + "additional_inputs" in instance for instance in instances + ), "each instance must have an additional_inputs field" + assert all( + isinstance(instance["additional_inputs"], dict) for instance in instances + ), "each instance must have an additional_inputs field that is a dict" + assert all( + "group_id" in instance["additional_inputs"] for instance in instances + ), "each instance additional_inputs dict must have a key group_id" + # validate the reduction_map assert ( - reduction_name in self.reduction_map - ), f"reduction_map must have a `{reduction_name}' key" - fields = self.reduction_map[reduction_name] + "group_mean" in self.reduction_map + ), "reduction_map must have a 'group_mean' key" + fields = self.reduction_map["group_mean"] # for group_mean, expects a dict assert isinstance(fields, dict) assert ( "agg_func" in fields - ), "fields should have a key 'agg_func' whose value is a 3-element list of a function name and function definition" + ), "fields should have a key 'agg_func' whose value is a 3-element list of a function name, function definition, and a boolean indicator" assert isinstance( fields["agg_func"], list ), "fields['agg_func'] should be a list" @@ -522,74 +554,20 @@ class GroupVarianceAccuracy(Accuracy): if "score_fields" in fields: assert isinstance(fields["score_fields"], list) - def _validate_group_mean_subgroup_comparison_reduction(self): - """Ensure that group_mean_subgroup_comparison reduction_map is properly formatted. 
- - Example: given a set of Accuracy instance scores, where the instances are grouped by grouping_field. - Assume that the instances in each group belong to two sub-groups, 'baseline' and 'others'. - An example is that we have an original dataset consisting of questions to be answered. A second dataset, - which we are evaluating, consists of the original question and multiple paraphrases or perturbations of it. - Thus, a group is defined by the original question (the 'baseline') plus the paraphrases ('others'). - An aggregation function here does not simply receive as inputs the set of all the group's instance score (e.g., - the average of all of them) but rather wants to make a comparison between the 'other' and 'baseline' scores. - For instance, return the difference between the baseline score and the average of the 'others' score. - - This reduction must have the same format as group_mean reduction map (see _validate_group_mean_reduction) in - terms of the 'agg_func' field; as noted in _validate_group_mean_reduction, the 3rd Boolean - element is recommended to be set as True to enforce group-level resampling in bootstrapping. - - The reduction must also have an additional field called 'baseline'. This field value is a list of two strings, - 1. the first a name of a column in the input dataset (fed into additional_inputs) - 2. the second the cell value in that column that indicates the baseline items. - The callable function must accept parameters baseline_scores and other_scores. An example is - - class GroupVsBaselineDiffAccuracy(Accuracy): - grouping_field = 'group_id' - reduction_map = {'group_mean_subgroup_comparison': {'agg_func': ['accuracy_diff', accuracy_diff, True], - 'subgroups': ['variant_type', 'original']} - } - # where the function is defined as - def accuracy_diff(baseline_scores, other_scores): - from statistics import mean - return mean(other_scores) - mean(baseline_scores) - - The input dataset should look like: - - 'group_id' 'question' 'variant_type' - 1 'How do you fix a car engine?' original - 1 'What is the best way to fix an engine?' paraphrase - 1 'How do you repair a car engine?' paraphrase - 1 'How do I repair my engine?' paraphrase - 2 'Why are ants eating my food?' original - ... 
- - """ - reduction_name = "group_mean_subgroup_comparison" - self._validate_group_mean_reduction(reduction_name=reduction_name) - # make sure aggregation function contains appropriate arguments - import inspect - - agg_func = self.reduction_map[reduction_name]["agg_func"][1] - func_args = list(inspect.signature(agg_func).parameters.keys()) - required_args = ["baseline_scores", "other_scores"] - assert all( - kk in func_args for kk in required_args - ), f"aggregation function {agg_func.__name} must accept parameters {required_args}" - - # validate baseline arguments - fields = self.reduction_map[reduction_name] - assert ( - "subgroups" in fields - ), "fields should have a key 'subgroups' whose value is a 2-element list of strings, a data column name and value identifier" - assert isinstance( - fields["subgroups"], list - ), "fields['subgroups'] should be a list" - assert ( - len(fields["subgroups"]) == 2 - ), "fields['subgroups'] should be a two-element list" - assert all( - isinstance(vv, str) for vv in fields["subgroups"] - ), "both elements in fields['subgroups'] should be a strings" + # for aggregations that conduct a comparison, expects a boolean is_baseline field + if self._function_does_comparison(fields["agg_func"][1]): + assert all( + "is_baseline" in instance["additional_inputs"] for instance in instances + ), f"since group_mean aggregation function {fields['agg_func'][1]} performs a comparison, each instance's additional_inputs dict must have a key is_baseline" + assert all( + ( + isinstance(instance["additional_inputs"]["is_baseline"], bool) + or isinstance( + instance["additional_inputs"]["is_baseline"], np.bool_ + ) + ) + for instance in instances + ), "is_baseline field must be boolean" def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator: instances, global_score = self.compute_instance_scores(stream) @@ -602,12 +580,13 @@ def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generato field_name_full_prefix = "" # used for passing to the bootstrapping, depends on whether the groups are fixed or not - aggregation_function = self.average_instance_scores + aggregation_function = self.average_item_scores if reduction_type == "mean": reduction_fields = list(set(reduction_params)) - instances_to_resample = deepcopy(instances) + # extract only the dict of instance scores + scores_to_resample = deepcopy(instances) elif reduction_type == "group_mean": - self._validate_group_mean_reduction() + self._validate_group_mean_reduction(instances=instances) reduction_fields = ( [self.main_score] if "score_fields" not in reduction_params @@ -615,29 +594,14 @@ def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generato ) aggregation_function_name = str(reduction_params["agg_func"][0]) field_name_full_prefix = "group_" + aggregation_function_name + "_" + if reduction_params["agg_func"][2]: + field_name_full_prefix = "fixed_" + field_name_full_prefix ( - instances_to_resample, + scores_to_resample, aggregation_function, ) = self._set_up_group_mean_aggregation( instances, reduction_params, reduction_fields ) - elif reduction_type == "group_mean_subgroup_comparison": - self._validate_group_mean_subgroup_comparison_reduction() - # same initial setup as group_mean reduction - reduction_fields = ( - [self.main_score] - if "score_fields" not in reduction_params - else list(set(reduction_params["score_fields"])) - ) - aggregation_function_name = str(reduction_params["agg_func"][0]) - field_name_full_prefix = "group_" + 
aggregation_function_name + "_" - # set up the aggregation function and the input to the confidence intervals - ( - instances_to_resample, - aggregation_function, - ) = self._set_up_group_mean_subgroup_comparison_aggregation( - instances, reduction_params, reduction_fields - ) else: raise ValueError( f"Reduction {reduction_type} is not supported, please specify a valid reduction method in reduction_map {self.reduction_map}." @@ -647,14 +611,14 @@ def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generato for field_name in reduction_fields: field_name_full = field_name_full_prefix + field_name # if group resampling (3rd element of agg_func parameter) is True, then - # 1. instances_to_resample are the group scores, and + # 1. scores_to_resample are the group scores, and # 2. aggregation_function is to take the raw mean # if no group resampling (3rd element of agg_func parameter) is False, then - # 1. instances_to_resample are the original instance scores, and + # 1. scores_to_resample are the original instance scores, and # 2. aggregation_function is to apply the group aggregation from the instance scores - # either way, the application of aggregation_function to instances_to_resample yields the global score + # either way, the application of aggregation_function to scores_to_resample yields the global score global_score[field_name_full] = aggregation_function( - instances_to_resample, field_name + scores_to_resample, field_name ) if field_name == self.main_score: global_score["score"] = global_score[field_name_full] @@ -664,7 +628,7 @@ def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generato # (will not automatically calculate CIs for fields in reduction map) if self.ci_scores is not None: confidence_interval = self.score_based_confidence_interval( - instances_to_resample, + instances=scores_to_resample, score_names=list(set(self.ci_scores)), ci_score_prefix=field_name_full_prefix, aggregation_func=aggregation_function, @@ -701,7 +665,7 @@ def compute_instance_scores( return instances, global_score - def get_group_aggregated_instance_scores( + def get_group_scores( self, instances: List[dict], field_names: List[str], group_aggregation_func ): """Return a list of group aggregation function value for group_mean reduction. @@ -717,87 +681,52 @@ def get_group_aggregated_instance_scores( from collections import defaultdict # two-level defaultdict: first is the grouping, second is the field name - group_to_instance_scores = defaultdict(lambda: defaultdict(list)) + # first list is instance scores for baseline group, second is for comparison group (if applicable) + group_to_instance_scores = defaultdict(lambda: defaultdict(lambda: [[], []])) + + # check if function has fields for baseline and others + func_does_comparison = self._function_does_comparison(group_aggregation_func) + if func_does_comparison: + assert ( + "is_baseline" in instance["additional_inputs"] for instance in instances + ), "all instances must have field is_baseline in additional_inputs" + for instance in instances: additional_inputs = instance["additional_inputs"] - if self.grouping_field not in additional_inputs: + if "group_id" not in additional_inputs: raise ValueError( - f"Missing '{self.grouping_field}' from instance {instance}. " + f"Missing 'group_id' from instance {instance}. " f"This field is required for group based metric computation." 
) - group_key = additional_inputs[self.grouping_field] - for field_name in field_names: - group_to_instance_scores[group_key][field_name].append( - instance["score"]["instance"][field_name] - ) - - # a list where each element is a group (not an instance), and the dict corresponds to - # the aggregate scores of each field within that group - return [ - { - "score": { - "instance": { - field_name: group_aggregation_func(scores[field_name]) - for field_name in field_names - } - } - } - for scores in group_to_instance_scores.values() - ] - - def get_group_aggregated_instance_scores_by_subgroups( - self, - instances: List[dict], - field_names: List[str], - group_aggregation_func, - subgroup_field: str, - baseline_name, - ): - """Return a list of group aggregation function value for group_mean_subgroup_comparison reduction. - - Args: - instances: List of observation instances with instance-level scores (fields) computed. - field_names: List of instance score names in each instance to apply the aggregation function. - group_aggregation_func: Callable aggregation function with arguments baseline_scores and other_scores. - subgroup_field: name of field in instance additional_inputs that contains the subgroup identifier - baseline_name: value of subgroup_field that indicates a score belongs in baseline_scores. - - Returns: - List of dicts, each corresponding to a group of instances (defined by grouping_field) - """ - from collections import defaultdict - - # two-level defaultdict: first is the grouping, second is the field name - # first list is instance scores for baseline group, second is for comparison group - group_to_instance_scores = defaultdict(lambda: defaultdict(lambda: [[], []])) - - for instance in instances: - additional_inputs = instance["additional_inputs"] - for cc in [self.grouping_field, subgroup_field]: - if cc not in additional_inputs: - raise ValueError( - f"Missing '{cc}' from instance {instance}. " - f"This field is required for group based metric computation." 
- ) - group_key = additional_inputs[self.grouping_field] - # indicator if is in the baseline group or not - is_baseline = str(additional_inputs[subgroup_field]) == baseline_name - # convert True (baseline) to 0, and False (others) to 1, store in respective groups + group_key = additional_inputs["group_id"] + # for functions that do baseline vs others group comparison + is_baseline = ( + additional_inputs["is_baseline"] if func_does_comparison else True + ) + # convert is_baseline=True (baseline) to 0, and False (others) to 1, store in respective groups idx = int(not is_baseline) - for field_name in field_names: group_to_instance_scores[group_key][field_name][idx].append( instance["score"]["instance"][field_name] ) - # now for each group, take the aggregation function, comparing others to baseline + def agg_func(first, second): + if func_does_comparison: + # if is a comparison function, pass both lists + return group_aggregation_func( + baseline_scores=first, other_scores=second + ) + # otherwise pass the first list to the default argument + return group_aggregation_func(first) + + # now apply this function to each group return [ { "score": { "instance": { - field_name: group_aggregation_func( - baseline_scores=scores[field_name][0], - other_scores=scores[field_name][1], + field_name: agg_func( + first=scores[field_name][0], + second=scores[field_name][1], ) for field_name in field_names } @@ -812,67 +741,27 @@ def _set_up_group_mean_aggregation( # if treat groups as units if reduction_params["agg_func"][2]: # pass the group aggregate---not instance---scores to resample as usual - aggregation_function = self.average_instance_scores - instances_to_resample = self.get_group_aggregated_instance_scores( + aggregation_function = self.average_item_scores + scores_to_resample = self.get_group_scores( instances, reduction_fields, reduction_params["agg_func"][1] ) else: # pass the instance scores to resample, and calculate the group aggregation on the resamplings - instances_to_resample = deepcopy(instances) + scores_to_resample = deepcopy(instances) def aggregation_function( instances, field_name, group_aggregation_func=reduction_params["agg_func"][1], ): - group_scores = self.get_group_aggregated_instance_scores( + group_scores = self.get_group_scores( instances, [field_name], group_aggregation_func ) return nan_mean( [group["score"]["instance"][field_name] for group in group_scores] ) - return instances_to_resample, aggregation_function - - def _set_up_group_mean_subgroup_comparison_aggregation( - self, instances, reduction_params, reduction_fields - ): - # if treat groups as units - if reduction_params["agg_func"][2]: - # pass the group aggregate---not instance---scores to resample as usual - aggregation_function = self.average_instance_scores - instances_to_resample = ( - self.get_group_aggregated_instance_scores_by_subgroups( - instances, - reduction_fields, - group_aggregation_func=reduction_params["agg_func"][1], - subgroup_field=reduction_params["subgroups"][0], - baseline_name=reduction_params["subgroups"][1], - ) - ) - else: - # pass the instance scores to resample, and calculate the group aggregation on the resamplings - instances_to_resample = deepcopy(instances) - - def aggregation_function( - instances, - field_name, - group_aggregation_func=reduction_params["agg_func"][1], - subgroup_field=reduction_params["subgroups"][0], - baseline_name=reduction_params["subgroups"][1], - ): - group_scores = self.get_group_aggregated_instance_scores_by_subgroups( - instances, - [field_name], - 
group_aggregation_func, - subgroup_field=subgroup_field, - baseline_name=baseline_name, - ) - return nan_mean( - [group["score"]["instance"][field_name] for group in group_scores] - ) - - return instances_to_resample, aggregation_function + return scores_to_resample, aggregation_function @abstractmethod def compute( @@ -2284,44 +2173,120 @@ def normalized_cohens_h(baseline_scores: List, other_scores: List, interpret=Fal return norm_h, how_signif[0] +def mean_baseline_score(baseline_scores: List, other_scores: List): + """Return average score on the baseline only. + + Args: + baseline_scores: a list of scores on baseline instances. + other_scores: a list of scores on instances that will be compared to the baseline. + + Returns: + float value + """ + baseline_scores, other_scores = validate_baseline_other_aggregation( + baseline_scores, other_scores + ) + if len(baseline_scores) == 0: + # no scores to use + return np.nan + assert all( + 0 <= score <= 1 for score in baseline_scores + ), "all scores must be in [0,1]" + return mean(baseline_scores) + + +def mean_others_score(baseline_scores: List, other_scores: List): + """Return average score on the others only. + + Args: + baseline_scores: a list of scores on baseline instances. + other_scores: a list of scores on instances that will be compared to the baseline. + + Returns: + float value + """ + baseline_scores, other_scores = validate_baseline_other_aggregation( + baseline_scores, other_scores + ) + if len(other_scores) == 0: + # no scores to use + return np.nan + assert all(0 <= score <= 1 for score in other_scores), "all scores must be in [0,1]" + return mean(other_scores) + + +# metrics using mean reduction class GroupMeanAccuracy(Accuracy): - grouping_field = "group_id" - reduction_map = {"group_mean": {"agg_func": ["mean", mean, False]}} + reduction_map = {"group_mean": {"agg_func": ["mean", nan_mean, False]}} class FixedGroupMeanAccuracy(Accuracy): # the same as GroupMeanAccuracy, except the groups are fixed and are resampled together - grouping_field = "group_id" - reduction_map = {"group_mean": {"agg_func": ["fixed_group_mean", mean, True]}} + reduction_map = {"group_mean": {"agg_func": ["mean", nan_mean, True]}} + + +# same as above, now using StringContainment +class GroupMeanStringContainment(StringContainment): + reduction_map = {"group_mean": {"agg_func": ["mean", nan_mean, False]}} + + +class FixedGroupMeanStringContainment(StringContainment): + # the same as GroupMeanStringContainment, except the groups are fixed and are resampled together + reduction_map = {"group_mean": {"agg_func": ["mean", nan_mean, True]}} -class GroupPDRAccuracy(Accuracy): - grouping_field = "group_id" +# take only the (fixed) group mean of baseline or other (paraphrases) scores +class FixedGroupMeanBaselineAccuracy(Accuracy): reduction_map = { - "group_mean_subgroup_comparison": { - "agg_func": ["pdr", performance_drop_rate, True], - "subgroups": ["variant_type", "original"], + "group_mean": { + "agg_func": ["mean_baseline", mean_baseline_score, True], } } -class GroupMeanStringContainment(StringContainment): - grouping_field = "group_id" - reduction_map = {"group_mean": {"agg_func": ["mean", nan_mean, False]}} +class FixedGroupMeanOthersAccuracy(Accuracy): + reduction_map = { + "group_mean": { + "agg_func": ["mean_others", mean_others_score, True], + } + } + + +# same as above but using StringContainment +class FixedGroupMeanBaselineStringContainment(StringContainment): + reduction_map = { + "group_mean": { + "agg_func": ["mean_baseline", 
mean_baseline_score, True], + } + } + + +class FixedGroupMeanOthersStringContainment(StringContainment): + reduction_map = { + "group_mean": { + "agg_func": ["mean_others", mean_others_score, True], + } + } -class GroupPDRStringContainment(StringContainment): - grouping_field = "group_id" +# using PDR +class FixedGroupPDRAccuracy(Accuracy): reduction_map = { - "group_mean_subgroup_comparison": { + "group_mean": { + "agg_func": ["pdr", performance_drop_rate, True], + } + } + + +class FixedGroupPDRStringContainment(StringContainment): + reduction_map = { + "group_mean": { "agg_func": ["pdr", performance_drop_rate, True], - "subgroups": ["variant_type", "original"], } } class GroupMeanTokenOverlap(TokenOverlap): - grouping_field = "group_id" reduction_map = { "group_mean": { "agg_func": ["mean", nan_mean, False], @@ -2330,21 +2295,18 @@ class GroupMeanTokenOverlap(TokenOverlap): } -class GroupNormCohensHAccuracy(Accuracy): - grouping_field = "group_id" +# using Cohens's h +class FixedGroupNormCohensHAccuracy(Accuracy): reduction_map = { - "group_mean_subgroup_comparison": { + "group_mean": { "agg_func": ["norm_cohens_h", normalized_cohens_h, True], - "subgroups": ["variant_type", "original"], } } -class GroupNormCohensHStringContainment(StringContainment): - grouping_field = "group_id" +class FixedGroupNormCohensHStringContainment(StringContainment): reduction_map = { - "group_mean_subgroup_comparison": { + "group_mean": { "agg_func": ["norm_cohens_h", normalized_cohens_h, True], - "subgroups": ["variant_type", "original"], } } diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 51a77ca8e0..3932540b4d 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -2,6 +2,8 @@ from copy import deepcopy from math import isnan +import numpy as np + from src.unitxt.logging_utils import get_logger from src.unitxt.metrics import ( Accuracy, @@ -11,13 +13,18 @@ F1MicroMultiLabel, F1Weighted, FixedGroupMeanAccuracy, + FixedGroupMeanBaselineAccuracy, + FixedGroupMeanBaselineStringContainment, + FixedGroupMeanOthersAccuracy, + FixedGroupMeanOthersStringContainment, + FixedGroupMeanStringContainment, + FixedGroupNormCohensHAccuracy, + FixedGroupNormCohensHStringContainment, + FixedGroupPDRAccuracy, + FixedGroupPDRStringContainment, GroupMeanAccuracy, GroupMeanStringContainment, GroupMeanTokenOverlap, - GroupNormCohensHAccuracy, - GroupNormCohensHStringContainment, - GroupPDRAccuracy, - GroupPDRStringContainment, Rouge, Squad, TokenOverlap, @@ -71,23 +78,25 @@ + [deepcopy({"group": "grp2", "id": 1, "ignore": 0}) for _ in range(1)] ) -# for group_mean_subgroup_comparison metrics, add a subgroup indicator (by default called 'variant_type') +# for group_mean aggregations with a subgroup_comparison, add a baseline indicator # these groupings correspond in length to the group identifiers above -VARIANT_TYPE = ( - (["original"] + ["paraphrase"] * 4) - + (["original"] + ["paraphrase"] * 4) - + (["original"] + ["paraphrase"] * 3) - + ["original"] -) +IS_BASELINE = np.concatenate( + ( + np.repeat(a=[True, False], repeats=[1, 4]), + np.repeat(a=[True, False], repeats=[1, 4]), + np.repeat(a=[True, False], repeats=[1, 3]), + np.repeat(a=[True, False], repeats=[1, 0]), + ) +).tolist() # construct grouping_field by combining two other fields (and ignoring one); mimics what you would do in cards group_by_fields = ["group", "id"] -for ai, vt in zip(GROUPED_INSTANCE_ADDL_INPUTS, VARIANT_TYPE): +for ai, ib in zip(GROUPED_INSTANCE_ADDL_INPUTS, IS_BASELINE): ai.update( { "group_id": "_".join([str(ai[ff]) for 
ff in group_by_fields]), - "variant_type": vt, + "is_baseline": ib, } ) @@ -466,22 +475,32 @@ def test_grouped_instance_metrics(self): accuracy_metrics = [ FixedGroupMeanAccuracy(), GroupMeanAccuracy(), + FixedGroupMeanStringContainment(), GroupMeanStringContainment(), - GroupPDRAccuracy(), - GroupPDRStringContainment(), - GroupNormCohensHAccuracy(), - GroupNormCohensHStringContainment(), + FixedGroupMeanBaselineAccuracy(), + FixedGroupMeanOthersAccuracy(), + FixedGroupMeanBaselineStringContainment(), + FixedGroupMeanOthersStringContainment(), GroupMeanTokenOverlap(), + FixedGroupNormCohensHAccuracy(), + FixedGroupNormCohensHStringContainment(), + FixedGroupPDRAccuracy(), + FixedGroupPDRStringContainment(), ] global_targets = [ 0.225, 0.225, 0.4875, - 0.8333333333333334, - 0.4444444444444445, + 0.4875, + 0.5, + 0.19444444444444442, + 0.75, + 0.5555555555555555, + 0.5083333333333333, -0.4249467048786864, -0.4639421840102023, - 0.5083333333333333, + 0.8333333333333334, + 0.4444444444444445, ] for metric, target in zip(accuracy_metrics, global_targets): outputs = apply_metric( @@ -493,34 +512,19 @@ def test_grouped_instance_metrics(self): self.assertAlmostEqual( target, outputs[0]["score"]["global"]["score"], - msg=f"{outputs[0]['score']['global']['score_name']} does not equal the expected value {target}", + msg=f"metric {metric.__class__.__name__} output {outputs[0]['score']['global']['score_name']} does not equal the expected value {target}", ) def test_grouped_instance_metric_errors(self): """Test certain value and assertion error raises for grouped instance metrics (with group_mean reduction).""" - from statistics import mean - - class NoGroupField(Accuracy): - reduction_map = {"group_mean": {"agg_func": ["mean", mean, True]}} - - with self.assertRaises(ValueError): - # should raise error because no grouping_field - metric = NoGroupField() - apply_metric( - metric=metric, - predictions=GROUPED_INSTANCE_PREDICTIONS, - references=GROUPED_INSTANCE_REFERENCES, - additional_inputs=GROUPED_INSTANCE_ADDL_INPUTS, - ) - from dataclasses import field + from statistics import mean from typing import List class NoAggFuncReduction(Accuracy): implemented_reductions: List[str] = field( default_factory=lambda: ["mean", "group_mean", "some_other_func"] ) - grouping_field = "group_id" reduction_map = {"some_other_func": {"agg_func": ["mean", mean, False]}} with self.assertRaises(ValueError): @@ -534,7 +538,6 @@ class NoAggFuncReduction(Accuracy): ) class NoAggFunc(Accuracy): - grouping_field = "group_id" reduction_map = {"group_mean": {"func": ["mean", mean]}} with self.assertRaises(AssertionError): @@ -548,7 +551,6 @@ class NoAggFunc(Accuracy): ) class NoCallableAggFunc(Accuracy): - grouping_field = "group_id" reduction_map = {"group_mean": {"agg_func": ["mean", "some string", False]}} with self.assertRaises(AssertionError): @@ -561,22 +563,7 @@ class NoCallableAggFunc(Accuracy): additional_inputs=GROUPED_INSTANCE_ADDL_INPUTS, ) - class WrongGroupID(Accuracy): - grouping_field = "random_id_name" - reduction_map = {"group_mean": {"agg_func": ["mean", mean, False]}} - - with self.assertRaises(ValueError): - # should raise error because grouping_field is not found in the additional inputs - metric = WrongGroupID() - apply_metric( - metric=metric, - predictions=GROUPED_INSTANCE_PREDICTIONS, - references=GROUPED_INSTANCE_REFERENCES, - additional_inputs=GROUPED_INSTANCE_ADDL_INPUTS, - ) - class NoBooleanGrouping(Accuracy): - grouping_field = "group_id" reduction_map = {"group_mean": {"agg_func": ["mean", mean, 
1]}} with self.assertRaises(AssertionError): @@ -694,6 +681,12 @@ def test_grouped_instance_metric_confidence_interval(self): expected_ci_high=0.44105968464125495, ) + self._test_grouped_instance_confidence_interval( + metric=FixedGroupMeanStringContainment(), + expected_ci_low=0.0, + expected_ci_high=0.675, + ) + self._test_grouped_instance_confidence_interval( metric=GroupMeanStringContainment(), expected_ci_low=0.15556138609239942, @@ -701,33 +694,53 @@ def test_grouped_instance_metric_confidence_interval(self): ) self._test_grouped_instance_confidence_interval( - metric=GroupPDRAccuracy(), - expected_ci_low=0.6666666666666666, + metric=FixedGroupMeanBaselineAccuracy(), + expected_ci_low=0.0, expected_ci_high=1.0, - reduction_name="group_mean_subgroup_comparison", ) self._test_grouped_instance_confidence_interval( - metric=GroupPDRStringContainment(), - expected_ci_low=0.3333333333333333, - expected_ci_high=0.5, - reduction_name="group_mean_subgroup_comparison", + metric=FixedGroupMeanOthersAccuracy(), + expected_ci_low=0.0, + expected_ci_high=0.3333333333333333, ) self._test_grouped_instance_confidence_interval( - metric=GroupNormCohensHAccuracy(), + metric=FixedGroupMeanBaselineStringContainment(), + expected_ci_low=0.25, + expected_ci_high=1.0, + ) + + self._test_grouped_instance_confidence_interval( + metric=FixedGroupMeanOthersStringContainment(), + expected_ci_low=0.5, + expected_ci_high=0.6666666666666666, + ) + + self._test_grouped_instance_confidence_interval( + metric=FixedGroupNormCohensHAccuracy(), expected_ci_low=-1.0, expected_ci_high=0.33333333333333337, - reduction_name="group_mean_subgroup_comparison", ) # note, this metric has an issue where the ci_high on PCs on Travis slightly diverges from the local results # hence this test may fail on a PC self._test_grouped_instance_confidence_interval( - metric=GroupNormCohensHStringContainment(), + metric=FixedGroupNormCohensHStringContainment(), expected_ci_low=-0.49999999999999994, expected_ci_high=-0.39182655203060723, - reduction_name="group_mean_subgroup_comparison", + ) + + self._test_grouped_instance_confidence_interval( + metric=FixedGroupPDRAccuracy(), + expected_ci_low=0.6666666666666666, + expected_ci_high=1.0, + ) + + self._test_grouped_instance_confidence_interval( + metric=FixedGroupPDRStringContainment(), + expected_ci_low=0.3333333333333333, + expected_ci_high=0.5, ) # pass global dict because there are additional fields other than the main score @@ -756,7 +769,6 @@ def _test_grouped_instance_confidence_interval( expected_ci_low=0.0, expected_ci_high=1.0, expected_global_result=None, - reduction_name="group_mean", ): """Test the calculation of confidence intervals for a given metric with group_mean reduction.""" outputs = apply_metric( @@ -765,11 +777,13 @@ def _test_grouped_instance_confidence_interval( references=GROUPED_INSTANCE_REFERENCES, additional_inputs=GROUPED_INSTANCE_ADDL_INPUTS, ) - + # get first element of reduction_map values + reduction_params = next(iter(metric.reduction_map.values())) + prefix = "fixed_group" if reduction_params["agg_func"][2] else "group" group_score_name = "_".join( [ - "group", - metric.reduction_map[reduction_name]["agg_func"][0], + prefix, + metric.reduction_map["group_mean"]["agg_func"][0], metric.main_score, ] ) @@ -790,7 +804,7 @@ def _test_grouped_instance_confidence_interval( score_value, expected_global_result[score_name], places=5, - msg=f"score mismatch for {group_score_name}, got {expected_global_result[score_name]} but expected {score_value}", + 
msg=f"{group_score_name} score mismatch for {metric.__class__.__name__}, got {expected_global_result[score_name]} but expected {score_value}", ) else: # An output score that is not expected From 84332dd8a69a03a1be3250ce56f4ffa41de08cd4 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Wed, 31 Jan 2024 22:56:13 +0200 Subject: [PATCH 40/83] import statistics.mean at the top --- src/unitxt/metrics.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 964155e45b..462c1f142f 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -5,6 +5,7 @@ from collections import Counter from copy import deepcopy from dataclasses import field +from statistics import mean from typing import Any, Dict, Generator, List, Optional, Tuple import evaluate @@ -1053,8 +1054,6 @@ def compute( average=self.average, ) if isinstance(result["f1"], numpy.ndarray): - from statistics import mean - final_result = {self.main_score: mean(result["f1"])} for i, label in enumerate(labels): final_result["f1_" + self.id_to_str[label]] = result["f1"][i] @@ -1148,8 +1147,6 @@ def compute( labels=labels_param, ) if isinstance(result[self.metric], numpy.ndarray): - from statistics import mean - assert ( len(result[self.metric]) == len(labels) ), f"F1 result ({result[self.metric]}) has more entries than labels ({labels})" @@ -1911,7 +1908,6 @@ def compute( additional_inputs: List[Any], ) -> dict: from collections import defaultdict - from statistics import mean query_to_predictions_and_references = defaultdict(lambda: [[], []]) for reference, pred, inputs_dict in zip( From 3ae446c397104497b63fe350fc1ee201b72188dd Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Wed, 31 Jan 2024 23:10:12 +0200 Subject: [PATCH 41/83] remove __name__ --- tests/test_metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 3932540b4d..0261c8dc5e 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -512,7 +512,7 @@ def test_grouped_instance_metrics(self): self.assertAlmostEqual( target, outputs[0]["score"]["global"]["score"], - msg=f"metric {metric.__class__.__name__} output {outputs[0]['score']['global']['score_name']} does not equal the expected value {target}", + msg=f"metric {metric} output {outputs[0]['score']['global']['score_name']} does not equal the expected value {target}", ) def test_grouped_instance_metric_errors(self): @@ -804,7 +804,7 @@ def _test_grouped_instance_confidence_interval( score_value, expected_global_result[score_name], places=5, - msg=f"{group_score_name} score mismatch for {metric.__class__.__name__}, got {expected_global_result[score_name]} but expected {score_value}", + msg=f"{group_score_name} score mismatch for {metric}, got {expected_global_result[score_name]} but expected {score_value}", ) else: # An output score that is not expected From bf72218c8ff9bbe16009e7148ff9e881b6c85d72 Mon Sep 17 00:00:00 2001 From: sam-data-guy-iam <124508972+sam-data-guy-iam@users.noreply.github.com> Date: Thu, 1 Feb 2024 12:09:30 +0200 Subject: [PATCH 42/83] Delete src/unitxt/catalog/metrics/group_mean_accuracy.json replace with file in robustness directory --- src/unitxt/catalog/metrics/group_mean_accuracy.json | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 src/unitxt/catalog/metrics/group_mean_accuracy.json diff --git a/src/unitxt/catalog/metrics/group_mean_accuracy.json b/src/unitxt/catalog/metrics/group_mean_accuracy.json deleted file mode 100644 index 
6aa448f66e..0000000000 --- a/src/unitxt/catalog/metrics/group_mean_accuracy.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "type": "group_mean_accuracy" -} From d443846f3f76a85dd41927bfcf7e54d5bdb895b6 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Thu, 1 Feb 2024 12:11:13 +0200 Subject: [PATCH 43/83] remove from catalog --- src/unitxt/catalog/metrics/group_mean_string_containment.json | 3 --- src/unitxt/catalog/metrics/group_mean_token_overlap.json | 3 --- src/unitxt/catalog/metrics/group_norm_cohens_h_accuracy.json | 3 --- .../metrics/group_norm_cohens_h_string_containment.json | 3 --- src/unitxt/catalog/metrics/group_pdr_accuracy.json | 3 --- src/unitxt/catalog/metrics/group_pdr_string_containment.json | 3 --- 6 files changed, 18 deletions(-) delete mode 100644 src/unitxt/catalog/metrics/group_mean_string_containment.json delete mode 100644 src/unitxt/catalog/metrics/group_mean_token_overlap.json delete mode 100644 src/unitxt/catalog/metrics/group_norm_cohens_h_accuracy.json delete mode 100644 src/unitxt/catalog/metrics/group_norm_cohens_h_string_containment.json delete mode 100644 src/unitxt/catalog/metrics/group_pdr_accuracy.json delete mode 100644 src/unitxt/catalog/metrics/group_pdr_string_containment.json diff --git a/src/unitxt/catalog/metrics/group_mean_string_containment.json b/src/unitxt/catalog/metrics/group_mean_string_containment.json deleted file mode 100644 index 0d34e5d851..0000000000 --- a/src/unitxt/catalog/metrics/group_mean_string_containment.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "type": "group_mean_string_containment" -} diff --git a/src/unitxt/catalog/metrics/group_mean_token_overlap.json b/src/unitxt/catalog/metrics/group_mean_token_overlap.json deleted file mode 100644 index 4487385870..0000000000 --- a/src/unitxt/catalog/metrics/group_mean_token_overlap.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "type": "group_mean_token_overlap" -} diff --git a/src/unitxt/catalog/metrics/group_norm_cohens_h_accuracy.json b/src/unitxt/catalog/metrics/group_norm_cohens_h_accuracy.json deleted file mode 100644 index 860926ed3c..0000000000 --- a/src/unitxt/catalog/metrics/group_norm_cohens_h_accuracy.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "type": "group_norm_cohens_h_accuracy" -} diff --git a/src/unitxt/catalog/metrics/group_norm_cohens_h_string_containment.json b/src/unitxt/catalog/metrics/group_norm_cohens_h_string_containment.json deleted file mode 100644 index 7fcc79b0e3..0000000000 --- a/src/unitxt/catalog/metrics/group_norm_cohens_h_string_containment.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "type": "group_norm_cohens_h_string_containment" -} diff --git a/src/unitxt/catalog/metrics/group_pdr_accuracy.json b/src/unitxt/catalog/metrics/group_pdr_accuracy.json deleted file mode 100644 index f56a12e782..0000000000 --- a/src/unitxt/catalog/metrics/group_pdr_accuracy.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "type": "group_pdr_accuracy" -} diff --git a/src/unitxt/catalog/metrics/group_pdr_string_containment.json b/src/unitxt/catalog/metrics/group_pdr_string_containment.json deleted file mode 100644 index c1bd327dd0..0000000000 --- a/src/unitxt/catalog/metrics/group_pdr_string_containment.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "type": "group_pdr_string_containment" -} From bd65681f2ca83fb5d89c03353db6ddd43bfdaed5 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Thu, 1 Feb 2024 12:11:35 +0200 Subject: [PATCH 44/83] move to own directory --- .../catalog/metrics/robustness/fixed_group_mean_accuracy.json | 3 +++ .../metrics/robustness/fixed_group_mean_baseline_accuracy.json | 3 +++ 
.../fixed_group_mean_baseline_string_containment.json | 3 +++ .../metrics/robustness/fixed_group_mean_others_accuracy.json | 3 +++ .../robustness/fixed_group_mean_others_string_containment.json | 3 +++ .../robustness/fixed_group_mean_string_containment.json | 3 +++ .../metrics/robustness/fixed_group_norm_cohens_h_accuracy.json | 3 +++ .../fixed_group_norm_cohens_h_string_containment.json | 3 +++ .../catalog/metrics/robustness/fixed_group_pdr_accuracy.json | 3 +++ .../metrics/robustness/fixed_group_pdr_string_containment.json | 3 +++ src/unitxt/catalog/metrics/robustness/group_mean_accuracy.json | 3 +++ .../metrics/robustness/group_mean_string_containment.json | 3 +++ .../catalog/metrics/robustness/group_mean_token_overlap.json | 3 +++ 13 files changed, 39 insertions(+) create mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_mean_accuracy.json create mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_mean_baseline_accuracy.json create mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_mean_baseline_string_containment.json create mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_mean_others_accuracy.json create mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_mean_others_string_containment.json create mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_mean_string_containment.json create mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_norm_cohens_h_accuracy.json create mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_norm_cohens_h_string_containment.json create mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_pdr_accuracy.json create mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_pdr_string_containment.json create mode 100644 src/unitxt/catalog/metrics/robustness/group_mean_accuracy.json create mode 100644 src/unitxt/catalog/metrics/robustness/group_mean_string_containment.json create mode 100644 src/unitxt/catalog/metrics/robustness/group_mean_token_overlap.json diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_mean_accuracy.json b/src/unitxt/catalog/metrics/robustness/fixed_group_mean_accuracy.json new file mode 100644 index 0000000000..0f57c3f77e --- /dev/null +++ b/src/unitxt/catalog/metrics/robustness/fixed_group_mean_accuracy.json @@ -0,0 +1,3 @@ +{ + "type": "fixed_group_mean_accuracy" +} diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_mean_baseline_accuracy.json b/src/unitxt/catalog/metrics/robustness/fixed_group_mean_baseline_accuracy.json new file mode 100644 index 0000000000..8530c26b9e --- /dev/null +++ b/src/unitxt/catalog/metrics/robustness/fixed_group_mean_baseline_accuracy.json @@ -0,0 +1,3 @@ +{ + "type": "fixed_group_mean_baseline_accuracy" +} diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_mean_baseline_string_containment.json b/src/unitxt/catalog/metrics/robustness/fixed_group_mean_baseline_string_containment.json new file mode 100644 index 0000000000..e46422cf8d --- /dev/null +++ b/src/unitxt/catalog/metrics/robustness/fixed_group_mean_baseline_string_containment.json @@ -0,0 +1,3 @@ +{ + "type": "fixed_group_mean_baseline_string_containment" +} diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_mean_others_accuracy.json b/src/unitxt/catalog/metrics/robustness/fixed_group_mean_others_accuracy.json new file mode 100644 index 0000000000..fe41681ff1 --- /dev/null +++ b/src/unitxt/catalog/metrics/robustness/fixed_group_mean_others_accuracy.json @@ -0,0 +1,3 @@ +{ + "type": 
"fixed_group_mean_others_accuracy" +} diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_mean_others_string_containment.json b/src/unitxt/catalog/metrics/robustness/fixed_group_mean_others_string_containment.json new file mode 100644 index 0000000000..08ad6bca43 --- /dev/null +++ b/src/unitxt/catalog/metrics/robustness/fixed_group_mean_others_string_containment.json @@ -0,0 +1,3 @@ +{ + "type": "fixed_group_mean_others_string_containment" +} diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_mean_string_containment.json b/src/unitxt/catalog/metrics/robustness/fixed_group_mean_string_containment.json new file mode 100644 index 0000000000..6d007ef52a --- /dev/null +++ b/src/unitxt/catalog/metrics/robustness/fixed_group_mean_string_containment.json @@ -0,0 +1,3 @@ +{ + "type": "fixed_group_mean_string_containment" +} diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_norm_cohens_h_accuracy.json b/src/unitxt/catalog/metrics/robustness/fixed_group_norm_cohens_h_accuracy.json new file mode 100644 index 0000000000..dc5f597162 --- /dev/null +++ b/src/unitxt/catalog/metrics/robustness/fixed_group_norm_cohens_h_accuracy.json @@ -0,0 +1,3 @@ +{ + "type": "fixed_group_norm_cohens_h_accuracy" +} diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_norm_cohens_h_string_containment.json b/src/unitxt/catalog/metrics/robustness/fixed_group_norm_cohens_h_string_containment.json new file mode 100644 index 0000000000..e72af8061e --- /dev/null +++ b/src/unitxt/catalog/metrics/robustness/fixed_group_norm_cohens_h_string_containment.json @@ -0,0 +1,3 @@ +{ + "type": "fixed_group_norm_cohens_h_string_containment" +} diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_pdr_accuracy.json b/src/unitxt/catalog/metrics/robustness/fixed_group_pdr_accuracy.json new file mode 100644 index 0000000000..56b62a01f9 --- /dev/null +++ b/src/unitxt/catalog/metrics/robustness/fixed_group_pdr_accuracy.json @@ -0,0 +1,3 @@ +{ + "type": "fixed_group_pdr_accuracy" +} diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_pdr_string_containment.json b/src/unitxt/catalog/metrics/robustness/fixed_group_pdr_string_containment.json new file mode 100644 index 0000000000..9b13641d2a --- /dev/null +++ b/src/unitxt/catalog/metrics/robustness/fixed_group_pdr_string_containment.json @@ -0,0 +1,3 @@ +{ + "type": "fixed_group_pdr_string_containment" +} diff --git a/src/unitxt/catalog/metrics/robustness/group_mean_accuracy.json b/src/unitxt/catalog/metrics/robustness/group_mean_accuracy.json new file mode 100644 index 0000000000..6aa448f66e --- /dev/null +++ b/src/unitxt/catalog/metrics/robustness/group_mean_accuracy.json @@ -0,0 +1,3 @@ +{ + "type": "group_mean_accuracy" +} diff --git a/src/unitxt/catalog/metrics/robustness/group_mean_string_containment.json b/src/unitxt/catalog/metrics/robustness/group_mean_string_containment.json new file mode 100644 index 0000000000..0d34e5d851 --- /dev/null +++ b/src/unitxt/catalog/metrics/robustness/group_mean_string_containment.json @@ -0,0 +1,3 @@ +{ + "type": "group_mean_string_containment" +} diff --git a/src/unitxt/catalog/metrics/robustness/group_mean_token_overlap.json b/src/unitxt/catalog/metrics/robustness/group_mean_token_overlap.json new file mode 100644 index 0000000000..4487385870 --- /dev/null +++ b/src/unitxt/catalog/metrics/robustness/group_mean_token_overlap.json @@ -0,0 +1,3 @@ +{ + "type": "group_mean_token_overlap" +} From 5933f33117ea837a390c9614e300ca6199fc19c6 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Thu, 1 
Feb 2024 12:11:50 +0200 Subject: [PATCH 45/83] return class name --- tests/test_metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 0261c8dc5e..3932540b4d 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -512,7 +512,7 @@ def test_grouped_instance_metrics(self): self.assertAlmostEqual( target, outputs[0]["score"]["global"]["score"], - msg=f"metric {metric} output {outputs[0]['score']['global']['score_name']} does not equal the expected value {target}", + msg=f"metric {metric.__class__.__name__} output {outputs[0]['score']['global']['score_name']} does not equal the expected value {target}", ) def test_grouped_instance_metric_errors(self): @@ -804,7 +804,7 @@ def _test_grouped_instance_confidence_interval( score_value, expected_global_result[score_name], places=5, - msg=f"{group_score_name} score mismatch for {metric}, got {expected_global_result[score_name]} but expected {score_value}", + msg=f"{group_score_name} score mismatch for {metric.__class__.__name__}, got {expected_global_result[score_name]} but expected {score_value}", ) else: # An output score that is not expected From ccced31ad50d14d73af767932394002d735a9dc7 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Thu, 1 Feb 2024 12:12:22 +0200 Subject: [PATCH 46/83] write metrics to robustness directory in catalog --- prepare/metrics/grouped_instance_metrics.py | 44 +++++++++++++++------ 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/prepare/metrics/grouped_instance_metrics.py b/prepare/metrics/grouped_instance_metrics.py index 2c7c5da041..fe300dc56b 100644 --- a/prepare/metrics/grouped_instance_metrics.py +++ b/prepare/metrics/grouped_instance_metrics.py @@ -152,7 +152,7 @@ additional_inputs=additional_inputs, ) -add_to_catalog(metric, "metrics.fixed_group_mean_accuracy", overwrite=True) +add_to_catalog(metric, "metrics.robustness.fixed_group_mean_accuracy", overwrite=True) metric = GroupMeanAccuracy() @@ -176,7 +176,7 @@ additional_inputs=additional_inputs, ) -add_to_catalog(metric, "metrics.group_mean_accuracy", overwrite=True) +add_to_catalog(metric, "metrics.robustness.group_mean_accuracy", overwrite=True) # group mean string containment, fixed and not @@ -201,7 +201,9 @@ additional_inputs=additional_inputs, ) -add_to_catalog(metric, "metrics.fixed_group_mean_string_containment", overwrite=True) +add_to_catalog( + metric, "metrics.robustness.fixed_group_mean_string_containment", overwrite=True +) metric = GroupMeanStringContainment() @@ -225,7 +227,9 @@ additional_inputs=additional_inputs, ) -add_to_catalog(metric, "metrics.group_mean_string_containment", overwrite=True) +add_to_catalog( + metric, "metrics.robustness.group_mean_string_containment", overwrite=True +) # Group mean of baseline or other scores @@ -249,7 +253,9 @@ additional_inputs=additional_inputs, ) -add_to_catalog(metric, "metrics.fixed_group_mean_baseline_accuracy", overwrite=True) +add_to_catalog( + metric, "metrics.robustness.fixed_group_mean_baseline_accuracy", overwrite=True +) metric = FixedGroupMeanOthersAccuracy() global_target = { @@ -271,7 +277,9 @@ additional_inputs=additional_inputs, ) -add_to_catalog(metric, "metrics.fixed_group_mean_others_accuracy", overwrite=True) +add_to_catalog( + metric, "metrics.robustness.fixed_group_mean_others_accuracy", overwrite=True +) metric = FixedGroupMeanBaselineStringContainment() @@ -295,7 +303,9 @@ ) add_to_catalog( - metric, "metrics.fixed_group_mean_baseline_string_containment", overwrite=True + 
metric, + "metrics.robustness.fixed_group_mean_baseline_string_containment", + overwrite=True, ) metric = FixedGroupMeanOthersStringContainment() @@ -319,7 +329,9 @@ ) add_to_catalog( - metric, "metrics.fixed_group_mean_others_string_containment", overwrite=True + metric, + "metrics.robustness.fixed_group_mean_others_string_containment", + overwrite=True, ) @@ -345,7 +357,7 @@ additional_inputs=additional_inputs, ) -add_to_catalog(metric, "metrics.fixed_group_pdr_accuracy", overwrite=True) +add_to_catalog(metric, "metrics.robustness.fixed_group_pdr_accuracy", overwrite=True) metric = FixedGroupPDRStringContainment() global_target = { @@ -368,7 +380,9 @@ additional_inputs=additional_inputs, ) -add_to_catalog(metric, "metrics.fixed_group_pdr_string_containment", overwrite=True) +add_to_catalog( + metric, "metrics.robustness.fixed_group_pdr_string_containment", overwrite=True +) # Cohen's H will always use fixed groups metric = FixedGroupNormCohensHAccuracy() @@ -392,7 +406,9 @@ additional_inputs=additional_inputs, ) -add_to_catalog(metric, "metrics.fixed_group_norm_cohens_h_accuracy", overwrite=True) +add_to_catalog( + metric, "metrics.robustness.fixed_group_norm_cohens_h_accuracy", overwrite=True +) metric = FixedGroupNormCohensHStringContainment() @@ -417,7 +433,9 @@ ) add_to_catalog( - metric, "metrics.fixed_group_norm_cohens_h_string_containment", overwrite=True + metric, + "metrics.robustness.fixed_group_norm_cohens_h_string_containment", + overwrite=True, ) # TokenOverlap: example of a metric that has more than one score @@ -468,4 +486,4 @@ additional_inputs=additional_inputs, ) -add_to_catalog(metric, "metrics.group_mean_token_overlap", overwrite=True) +add_to_catalog(metric, "metrics.robustness.group_mean_token_overlap", overwrite=True) From 430c1a56657e5bf4b051990d4f5cb7be7c31bd44 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Mon, 5 Feb 2024 16:31:49 +0200 Subject: [PATCH 47/83] rename others to paraphrase; use variant_score_dict rather than is_baseline boolean indicator, to accomodate cases where there are >2 variant types and we want to run two or more metrics on them --- prepare/metrics/grouped_instance_metrics.py | 47 ++-- .../fixed_group_mean_others_accuracy.json | 2 +- src/unitxt/metrics.py | 257 ++++++++++-------- tests/test_metrics.py | 30 +- 4 files changed, 180 insertions(+), 156 deletions(-) diff --git a/prepare/metrics/grouped_instance_metrics.py b/prepare/metrics/grouped_instance_metrics.py index fe300dc56b..c2aca38f20 100644 --- a/prepare/metrics/grouped_instance_metrics.py +++ b/prepare/metrics/grouped_instance_metrics.py @@ -7,8 +7,8 @@ FixedGroupMeanAccuracy, FixedGroupMeanBaselineAccuracy, FixedGroupMeanBaselineStringContainment, - FixedGroupMeanOthersAccuracy, - FixedGroupMeanOthersStringContainment, + FixedGroupMeanParaphraseAccuracy, + FixedGroupMeanParaphraseStringContainment, FixedGroupMeanStringContainment, FixedGroupNormCohensHAccuracy, FixedGroupNormCohensHStringContainment, @@ -64,24 +64,23 @@ + [deepcopy({"group": "grp2", "id": 0, "ignore": 1}) for _ in range(4)] + [deepcopy({"group": "grp2", "id": 1, "ignore": 0}) for _ in range(1)] ) -# for group_mean aggregations with a subgroup_comparison, add a baseline indicator +# for group_mean aggregations with a subgroup_comparison, add a variant_type label # these groupings correspond in length to the group identifiers above -is_baseline = np.concatenate( - ( - np.repeat(a=[True, False], repeats=[1, 4]), - np.repeat(a=[True, False], repeats=[1, 4]), - np.repeat(a=[True, False], repeats=[1, 3]), - 
np.repeat(a=[True, False], repeats=[1, 0]), - ) +variant_type = np.concatenate( + [ + np.repeat(a=["original", "paraphrase"], repeats=reps) + for reps in [[1, 4], [1, 4], [1, 3], [1, 0]] + ] ).tolist() + # construct grouping_field by combining two other fields (and ignoring one); mimics what you would do in cards group_by_fields = ["group", "id"] -for ai, ib in zip(additional_inputs, is_baseline): +for ai, vt in zip(additional_inputs, variant_type): ai.update( { "group_id": "_".join([str(ai[ff]) for ff in group_by_fields]), - "is_baseline": ib, + "variant_type": vt, } ) @@ -257,15 +256,15 @@ metric, "metrics.robustness.fixed_group_mean_baseline_accuracy", overwrite=True ) -metric = FixedGroupMeanOthersAccuracy() +metric = FixedGroupMeanParaphraseAccuracy() global_target = { - "fixed_group_mean_others_accuracy": 0.19, + "fixed_group_mean_paraphrase_accuracy": 0.19, "score": 0.19, - "score_name": "fixed_group_mean_others_accuracy", + "score_name": "fixed_group_mean_paraphrase_accuracy", "score_ci_low": 0.0, "score_ci_high": 0.33, - "fixed_group_mean_others_accuracy_ci_low": 0.0, - "fixed_group_mean_others_accuracy_ci_high": 0.33, + "fixed_group_mean_paraphrase_accuracy_ci_low": 0.0, + "fixed_group_mean_paraphrase_accuracy_ci_high": 0.33, } outputs = test_metric( @@ -278,7 +277,7 @@ ) add_to_catalog( - metric, "metrics.robustness.fixed_group_mean_others_accuracy", overwrite=True + metric, "metrics.robustness.fixed_group_mean_paraphrase_accuracy", overwrite=True ) @@ -308,15 +307,15 @@ overwrite=True, ) -metric = FixedGroupMeanOthersStringContainment() +metric = FixedGroupMeanParaphraseStringContainment() global_target = { - "fixed_group_mean_others_string_containment": 0.56, + "fixed_group_mean_paraphrase_string_containment": 0.56, "score": 0.56, - "score_name": "fixed_group_mean_others_string_containment", + "score_name": "fixed_group_mean_paraphrase_string_containment", "score_ci_low": 0.5, "score_ci_high": 0.67, - "fixed_group_mean_others_string_containment_ci_low": 0.5, - "fixed_group_mean_others_string_containment_ci_high": 0.67, + "fixed_group_mean_paraphrase_string_containment_ci_low": 0.5, + "fixed_group_mean_paraphrase_string_containment_ci_high": 0.67, } outputs = test_metric( @@ -330,7 +329,7 @@ add_to_catalog( metric, - "metrics.robustness.fixed_group_mean_others_string_containment", + "metrics.robustness.fixed_group_mean_paraphrase_string_containment", overwrite=True, ) diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_mean_others_accuracy.json b/src/unitxt/catalog/metrics/robustness/fixed_group_mean_others_accuracy.json index fe41681ff1..2ae1094239 100644 --- a/src/unitxt/catalog/metrics/robustness/fixed_group_mean_others_accuracy.json +++ b/src/unitxt/catalog/metrics/robustness/fixed_group_mean_others_accuracy.json @@ -1,3 +1,3 @@ { - "type": "fixed_group_mean_others_accuracy" + "type": "fixed_group_mean_paraphrase_accuracy" } diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 462c1f142f..c309448e47 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -459,12 +459,12 @@ def reduction_map(self) -> dict: pass @staticmethod - def _function_does_comparison(aggregation_func): + def _function_uses_variant_types(aggregation_func): import inspect func_args = list(inspect.signature(aggregation_func).parameters.keys()) - # if function has both these arguments, assume it does comparison (i.e. 
it expects both arguments) - return all(fa in func_args for fa in ["baseline_scores", "other_scores"]) + # if function has this argument, assume it does comparison + return "variant_scores_dict" in func_args def _validate_group_mean_reduction(self, instances: List[dict]): """Ensure that group_mean reduction_map is properly formatted. @@ -490,11 +490,11 @@ class GroupVarianceAccuracy(Accuracy): The aggregation function (2nd element of agg_func) can be one of two types: 1. simple: calculate a summary statistic from a single group of values (e.g. mean, median, etc.). This is best suited for cases where the instances are independent of each other, other than belonging to the same group - 2. comparison: requires additional_inputs to have a boolean key 'is_baseline'. This function conducts - a comparison between baseline instances (is_baseline=True) and other (is_baseline=False). + 2. comparison: requires additional_inputs to have a boolean key 'variant_type'. This function conducts + a comparison between scores for differing variant_types (e.g., 'original' vs 'paraphrase'). An example is where the baseline instance is a question, and the others are various paraphrases or perturbations of this question. Here, the function would return, say, a comparison of the instance accuracies - rather than, say, the average instance accuracy. It requires arguments 'baseline_scores' and 'other_scores' + rather than, say, the average instance accuracy. It requires an argument variant_scores_dict. In these cases, we recommend setting the 3rd parameter to be True so that the groups are resampled together. Example: @@ -502,18 +502,18 @@ class GroupVsBaselineDiffAccuracy(Accuracy): reduction_map = {'group_mean': {'agg_func': ['accuracy_diff', accuracy_diff, True],}} # where the function is defined as - def accuracy_diff(baseline_scores, other_scores): + def accuracy_diff(variant_scores_dict, expected_variant_types=['original', 'paraphrase']): + validate_variant_types(variant_scores_dict, expected_variant_types) from statistics import mean - return mean(other_scores) - mean(baseline_scores) - + return mean(variant_scores_dict['paraphrase']) - mean(variant_scores_dict['original']) The input dataset should look like: - 'group_id' 'question' 'is_baseline' - 1 'How do you fix a car engine?' True - 1 'What is the best way to fix an engine?' False - 1 'How do you repair a car engine?' False - 1 'How do I repair my engine?' False - 2 'Why are ants eating my food?' True + 'group_id' 'question' 'variant_type' + 1 'How do you fix a car engine?' 'original' + 1 'What is the best way to fix an engine?' 'paraphrase' + 1 'How do you repair a car engine?' 'paraphrase' + 1 'How do I repair my engine?' 'paraphrase' + 2 'Why are ants eating my food?' 
'original' """ # instances need to all have additional_inputs field with field group_id assert all( @@ -554,20 +554,12 @@ def accuracy_diff(baseline_scores, other_scores): if "score_fields" in fields: assert isinstance(fields["score_fields"], list) - # for aggregations that conduct a comparison, expects a boolean is_baseline field - if self._function_does_comparison(fields["agg_func"][1]): - assert all( - "is_baseline" in instance["additional_inputs"] for instance in instances - ), f"since group_mean aggregation function {fields['agg_func'][1]} performs a comparison, each instance's additional_inputs dict must have a key is_baseline" + # for aggregations that conduct a comparison, expects a 'variant_type' field + if self._function_uses_variant_types(fields["agg_func"][1]): assert all( - ( - isinstance(instance["additional_inputs"]["is_baseline"], bool) - or isinstance( - instance["additional_inputs"]["is_baseline"], np.bool_ - ) - ) + "variant_type" in instance["additional_inputs"] for instance in instances - ), "is_baseline field must be boolean" + ), f"since group_mean aggregation function {fields['agg_func'][1]} performs a comparison, each instance's additional_inputs dict must have a key variant_type" def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator: instances, global_score = self.compute_instance_scores(stream) @@ -673,24 +665,40 @@ def get_group_scores( Args: instances: List of observation instances with instance-level scores (fields) computed. field_names: List of instance score names in each instance to apply the aggregation function. - group_aggregation_func: Callable aggregation function accepting a list of numeric scores and returning a single score . + group_aggregation_func: Callable aggregation function accepting a list of numeric scores, or a dict of variant types scores, + and returning a single score. 
Returns: - List of dicts, each corresponding to a group of instances (defined by grouping_field) + List of dicts, each corresponding to a group of instances (defined by grouping_field), + with a group score for each field_name """ from collections import defaultdict - # two-level defaultdict: first is the grouping, second is the field name - # first list is instance scores for baseline group, second is for comparison group (if applicable) - group_to_instance_scores = defaultdict(lambda: defaultdict(lambda: [[], []])) + # three-level defaultdict: + # first is the grouping, second is the field name, the third is the variant_type (by default 'original') + group_to_instance_scores = defaultdict( + lambda: defaultdict(lambda: defaultdict(list)) + ) - # check if function has fields for baseline and others - func_does_comparison = self._function_does_comparison(group_aggregation_func) - if func_does_comparison: - assert ( - "is_baseline" in instance["additional_inputs"] for instance in instances - ), "all instances must have field is_baseline in additional_inputs" + # check if function has fields for variant_type + uses_variant_type = self._function_uses_variant_types(group_aggregation_func) + if uses_variant_type: + assert all( + "variant_type" in instance["additional_inputs"] + for instance in instances + ), "all instances must have field 'variant_type' in additional_inputs" + # define the aggregation function + def agg_func(variant_scores_dict): + # if function a uses the variant types, pass the full dict + return group_aggregation_func(variant_scores_dict=variant_scores_dict) + else: + + def agg_func(variant_scores_dict): + # otherwise pass the default 'original' scores to the default argument + return group_aggregation_func(variant_scores_dict["original"]) + + # loop through the instances and group the scores for instance in instances: additional_inputs = instance["additional_inputs"] if "group_id" not in additional_inputs: @@ -699,36 +707,22 @@ def get_group_scores( f"This field is required for group based metric computation." 
) group_key = additional_inputs["group_id"] - # for functions that do baseline vs others group comparison - is_baseline = ( - additional_inputs["is_baseline"] if func_does_comparison else True + # for functions that do comparisons between variant_type gorups + variant_type = ( + additional_inputs["variant_type"] if uses_variant_type else "original" ) - # convert is_baseline=True (baseline) to 0, and False (others) to 1, store in respective groups - idx = int(not is_baseline) for field_name in field_names: - group_to_instance_scores[group_key][field_name][idx].append( + group_to_instance_scores[group_key][field_name][variant_type].append( instance["score"]["instance"][field_name] ) - def agg_func(first, second): - if func_does_comparison: - # if is a comparison function, pass both lists - return group_aggregation_func( - baseline_scores=first, other_scores=second - ) - # otherwise pass the first list to the default argument - return group_aggregation_func(first) - - # now apply this function to each group + # now apply the appropriate aggregation function to each group return [ { "score": { "instance": { - field_name: agg_func( - first=scores[field_name][0], - second=scores[field_name][1], - ) - for field_name in field_names + field_name: agg_func(score_dict) + for field_name, score_dict in scores.items() } } } @@ -2106,22 +2100,43 @@ def should_ignore_element(self, element, additional_input): # define metrics that return means of an aggregation function applied across levels of a grouping variable -def validate_baseline_other_aggregation(baseline_scores, other_scores): - assert isinstance(baseline_scores, list) - assert isinstance(other_scores, list) - baseline_scores = [vv for vv in baseline_scores if not np.isnan(vv)] - other_scores = [vv for vv in other_scores if not np.isnan(vv)] - return baseline_scores, other_scores +def validate_variant_types( + variant_scores_dict: dict[List], expected_variant_types: None +): + """Validate a dict of variant type instance score lists. + Args: + variant_scores_dict: dict where keys are variant types and values are lists of instance scores. + expected_variant_types: list of the variant types which should exist in variant_scores_dict, so + that another function that receives it as input will valid. -def performance_drop_rate(baseline_scores: List, other_scores: List): + Returns: + dict with all NaN scores removed; any expected keys that are missing have an empty list inserted + """ + # remove any NaNs + variant_scores_dict.update( + { + kk: [vvv for vvv in vv if not np.isnan(vvv)] + for kk, vv in variant_scores_dict.items() + } + ) + # make sure the expected types appear + variant_scores_dict.update( + {kk: [] for kk in expected_variant_types if kk not in variant_scores_dict} + ) + return variant_scores_dict + + +def performance_drop_rate(variant_scores_dict: dict, expected_variant_types=None): """Percentage decrease of mean performance on test elements relative to that on a baseline. from https://arxiv.org/pdf/2306.04528.pdf. Args: - baseline_scores: a list of scores on baseline instances. - other_scores: a list of scores on instances that will be compared to the baseline. + variant_scores_dict: dict where keys are from the set ('original', 'paraphrase') and values are lists + of instance scores corresponding to variant_types with that key + expected_variant_types: tuple of the expected labels in variant_scores_dict (the first should be the + baseline type, the second the other). Returns: numeric PDR metric. 
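[Illustrative sketch, not part of the patch] To make the dict-based PDR aggregation introduced above concrete, the snippet below reproduces the same arithmetic as the function body in the next hunk, on made-up scores for a single group. The 'original'/'paraphrase' keys follow the convention used in this series; the score values are invented for illustration.

import numpy as np
from statistics import mean

# toy instance scores for one group, keyed by variant_type (values are made up)
variant_scores_dict = {
    "original": [1.0, 1.0],          # scores on the baseline variants
    "paraphrase": [1.0, 0.0, 0.0],   # scores on the perturbed variants
}

baseline_mean = mean(variant_scores_dict["original"])   # 1.0
other_mean = mean(variant_scores_dict["paraphrase"])    # ~0.333
# PDR is the relative drop of the comparison mean vs. the baseline mean; undefined if the baseline is 0
pdr = np.nan if baseline_mean == 0 else 1 - other_mean / baseline_mean
print(round(pdr, 3))  # 0.667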
@@ -2129,18 +2144,24 @@ def performance_drop_rate(baseline_scores: List, other_scores: List): otherwise, calculate PDR """ - baseline_scores, other_scores = validate_baseline_other_aggregation( - baseline_scores, other_scores + if expected_variant_types is None: + expected_variant_types = ["original", "paraphrase"] + assert len(expected_variant_types) == 2 + variant_scores_dict = validate_variant_types( + variant_scores_dict, expected_variant_types ) - if len(baseline_scores) == 0 or len(other_scores) == 0: + if any(len(variant_scores_dict[kk]) == 0 for kk in expected_variant_types): # no comparison can be made since there is not at least one score per type return np.nan - baseline_mean = mean(baseline_scores) - other_mean = mean(other_scores) + # first key should be the baseline category + baseline_mean = mean(variant_scores_dict[expected_variant_types[0]]) + other_mean = mean(variant_scores_dict[expected_variant_types[1]]) return np.nan if baseline_mean == 0 else 1 - other_mean / baseline_mean -def normalized_cohens_h(baseline_scores: List, other_scores: List, interpret=False): +def normalized_cohens_h( + variant_scores_dict: dict, expected_variant_types=None, interpret=False +): """Cohen's h effect size between two proportions, normalized to interval [-1,1]. Allows for change-type metric when the baseline is 0 (percentage change, and thus PDR, is undefined) @@ -2165,24 +2186,30 @@ def normalized_cohens_h(baseline_scores: List, other_scores: List, interpret=Fal - a medium difference if 0.15915494 <= |norm h| < 0.25464791 - a large difference if 0.25464791 <= |norm h| Args: - baseline_scores: a list of scores on baseline instances. - other_scores: a list of scores on instances that will be compared to the baseline. + variant_scores_dict: dict where keys are from the set ('original', 'paraphrase') and values are lists + of instance scores corresponding to variant_types with that key + expected_variant_types: tuple of the expected labels in variant_scores_dict (the first should be the + baseline type, the second the other). 
interpret: boolean, whether to interpret the significance of the score or not Returns: float score between -1 and 1, and a string interpretation if interpret=True """ - baseline_scores, other_scores = validate_baseline_other_aggregation( - baseline_scores, other_scores + if expected_variant_types is None: + expected_variant_types = ["original", "paraphrase"] + assert len(expected_variant_types) == 2 + variant_scores_dict = validate_variant_types( + variant_scores_dict, expected_variant_types ) - if len(baseline_scores) == 0 or len(other_scores) == 0: + if any(len(variant_scores_dict[kk]) == 0 for kk in expected_variant_types): # no comparison can be made since there is not at least one score per type return np.nan - for score_list in zip(baseline_scores, other_scores): + # requires scores to be in [0,1] + for kk, score_list in variant_scores_dict.items(): assert all( 0 <= score <= 1 for score in score_list - ), "all scores must be in [0,1]" - baseline_mean = mean(baseline_scores) - other_mean = mean(other_scores) + ), f"all {kk} scores must be in [0,1]" + baseline_mean = mean(variant_scores_dict[expected_variant_types[0]]) + other_mean = mean(variant_scores_dict[expected_variant_types[1]]) h = 2 * (np.arcsin(np.sqrt(other_mean)) - np.arcsin(np.sqrt(baseline_mean))) norm_h = np.clip(a=h / np.pi, a_min=-1, a_max=1) if not interpret: @@ -2199,46 +2226,46 @@ def normalized_cohens_h(baseline_scores: List, other_scores: List, interpret=Fal return norm_h, how_signif[0] -def mean_baseline_score(baseline_scores: List, other_scores: List): - """Return average score on the baseline only. +def mean_variant_score( + variant_scores_dict: dict, expected_variant_type: str = "original" +): + """Return the mean instance score for a single type of variant (not a comparison). Args: - baseline_scores: a list of scores on baseline instances. - other_scores: a list of scores on instances that will be compared to the baseline. + variant_scores_dict: dict where keys are variant types and values are lists of instance scores. + expected_variant_type: the key (variant type) for which the average will be computed Returns: - float value + float score """ - baseline_scores, other_scores = validate_baseline_other_aggregation( - baseline_scores, other_scores + variant_scores_dict = validate_variant_types( + variant_scores_dict, [expected_variant_type] ) - if len(baseline_scores) == 0: + score_list = variant_scores_dict[expected_variant_type] + if len(score_list) == 0: # no scores to use return np.nan - assert all( - 0 <= score <= 1 for score in baseline_scores - ), "all scores must be in [0,1]" - return mean(baseline_scores) + return mean(score_list) -def mean_others_score(baseline_scores: List, other_scores: List): - """Return average score on the others only. +def mean_original_score(variant_scores_dict: dict): + """Return average score on the baseline only. Args: - baseline_scores: a list of scores on baseline instances. - other_scores: a list of scores on instances that will be compared to the baseline. + variant_scores_dict: dict where one key should be 'original' and the values a list of + original instance scores + """ + return mean_variant_score(variant_scores_dict, "original") - Returns: - float value + +def mean_paraphrase_score(variant_scores_dict: dict): + """Return average score on the paraphrases only. 
+ + Args: + variant_scores_dict: dict where one key should be 'paraphrase' and the values a list of + original instance scores """ - baseline_scores, other_scores = validate_baseline_other_aggregation( - baseline_scores, other_scores - ) - if len(other_scores) == 0: - # no scores to use - return np.nan - assert all(0 <= score <= 1 for score in other_scores), "all scores must be in [0,1]" - return mean(other_scores) + return mean_variant_score(variant_scores_dict, "paraphrase") # metrics using mean reduction @@ -2265,15 +2292,15 @@ class FixedGroupMeanStringContainment(StringContainment): class FixedGroupMeanBaselineAccuracy(Accuracy): reduction_map = { "group_mean": { - "agg_func": ["mean_baseline", mean_baseline_score, True], + "agg_func": ["mean_baseline", mean_original_score, True], } } -class FixedGroupMeanOthersAccuracy(Accuracy): +class FixedGroupMeanParaphraseAccuracy(Accuracy): reduction_map = { "group_mean": { - "agg_func": ["mean_others", mean_others_score, True], + "agg_func": ["mean_paraphrase", mean_paraphrase_score, True], } } @@ -2282,15 +2309,15 @@ class FixedGroupMeanOthersAccuracy(Accuracy): class FixedGroupMeanBaselineStringContainment(StringContainment): reduction_map = { "group_mean": { - "agg_func": ["mean_baseline", mean_baseline_score, True], + "agg_func": ["mean_baseline", mean_original_score, True], } } -class FixedGroupMeanOthersStringContainment(StringContainment): +class FixedGroupMeanParaphraseStringContainment(StringContainment): reduction_map = { "group_mean": { - "agg_func": ["mean_others", mean_others_score, True], + "agg_func": ["mean_paraphrase", mean_paraphrase_score, True], } } diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 3932540b4d..fddbd6d24e 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -15,8 +15,8 @@ FixedGroupMeanAccuracy, FixedGroupMeanBaselineAccuracy, FixedGroupMeanBaselineStringContainment, - FixedGroupMeanOthersAccuracy, - FixedGroupMeanOthersStringContainment, + FixedGroupMeanParaphraseAccuracy, + FixedGroupMeanParaphraseStringContainment, FixedGroupMeanStringContainment, FixedGroupNormCohensHAccuracy, FixedGroupNormCohensHStringContainment, @@ -78,25 +78,23 @@ + [deepcopy({"group": "grp2", "id": 1, "ignore": 0}) for _ in range(1)] ) -# for group_mean aggregations with a subgroup_comparison, add a baseline indicator +# for group_mean aggregations with a subgroup_comparison, add a variant_type label # these groupings correspond in length to the group identifiers above -IS_BASELINE = np.concatenate( - ( - np.repeat(a=[True, False], repeats=[1, 4]), - np.repeat(a=[True, False], repeats=[1, 4]), - np.repeat(a=[True, False], repeats=[1, 3]), - np.repeat(a=[True, False], repeats=[1, 0]), - ) +VARIANT_TYPE = np.concatenate( + [ + np.repeat(a=["original", "paraphrase"], repeats=reps) + for reps in [[1, 4], [1, 4], [1, 3], [1, 0]] + ] ).tolist() # construct grouping_field by combining two other fields (and ignoring one); mimics what you would do in cards group_by_fields = ["group", "id"] -for ai, ib in zip(GROUPED_INSTANCE_ADDL_INPUTS, IS_BASELINE): +for ai, vt in zip(GROUPED_INSTANCE_ADDL_INPUTS, VARIANT_TYPE): ai.update( { "group_id": "_".join([str(ai[ff]) for ff in group_by_fields]), - "is_baseline": ib, + "variant_type": vt, } ) @@ -478,9 +476,9 @@ def test_grouped_instance_metrics(self): FixedGroupMeanStringContainment(), GroupMeanStringContainment(), FixedGroupMeanBaselineAccuracy(), - FixedGroupMeanOthersAccuracy(), + FixedGroupMeanParaphraseAccuracy(), FixedGroupMeanBaselineStringContainment(), - 
FixedGroupMeanOthersStringContainment(), + FixedGroupMeanParaphraseStringContainment(), GroupMeanTokenOverlap(), FixedGroupNormCohensHAccuracy(), FixedGroupNormCohensHStringContainment(), @@ -700,7 +698,7 @@ def test_grouped_instance_metric_confidence_interval(self): ) self._test_grouped_instance_confidence_interval( - metric=FixedGroupMeanOthersAccuracy(), + metric=FixedGroupMeanParaphraseAccuracy(), expected_ci_low=0.0, expected_ci_high=0.3333333333333333, ) @@ -712,7 +710,7 @@ def test_grouped_instance_metric_confidence_interval(self): ) self._test_grouped_instance_confidence_interval( - metric=FixedGroupMeanOthersStringContainment(), + metric=FixedGroupMeanParaphraseStringContainment(), expected_ci_low=0.5, expected_ci_high=0.6666666666666666, ) From 94d05285cb7d94ed0300c1e78313e229ba822f4b Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Mon, 5 Feb 2024 16:32:30 +0200 Subject: [PATCH 48/83] initial commit --- .../robustness/fixed_group_mean_paraphrase_accuracy.json | 3 +++ .../fixed_group_mean_paraphrase_string_containment.json | 3 +++ 2 files changed, 6 insertions(+) create mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_mean_paraphrase_accuracy.json create mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_mean_paraphrase_string_containment.json diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_mean_paraphrase_accuracy.json b/src/unitxt/catalog/metrics/robustness/fixed_group_mean_paraphrase_accuracy.json new file mode 100644 index 0000000000..2ae1094239 --- /dev/null +++ b/src/unitxt/catalog/metrics/robustness/fixed_group_mean_paraphrase_accuracy.json @@ -0,0 +1,3 @@ +{ + "type": "fixed_group_mean_paraphrase_accuracy" +} diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_mean_paraphrase_string_containment.json b/src/unitxt/catalog/metrics/robustness/fixed_group_mean_paraphrase_string_containment.json new file mode 100644 index 0000000000..cdc74c39e9 --- /dev/null +++ b/src/unitxt/catalog/metrics/robustness/fixed_group_mean_paraphrase_string_containment.json @@ -0,0 +1,3 @@ +{ + "type": "fixed_group_mean_paraphrase_string_containment" +} From 5747630edd9653587e7eb6e9428c5d0287986e1f Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Mon, 5 Feb 2024 16:40:26 +0200 Subject: [PATCH 49/83] fix type hint in validate_variant_types --- src/unitxt/metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index c309448e47..1371f87b6f 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -2101,7 +2101,7 @@ def should_ignore_element(self, element, additional_input): # define metrics that return means of an aggregation function applied across levels of a grouping variable def validate_variant_types( - variant_scores_dict: dict[List], expected_variant_types: None + variant_scores_dict: Dict[List], expected_variant_types: None ): """Validate a dict of variant type instance score lists. 
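[Illustrative sketch, not part of the patch] After this commit the annotation is still under-parameterized: typing.Dict takes a key type and a value type, which the next commit supplies as Dict[str, List]. A minimal sketch of the fully parameterized form, with made-up values:

from typing import Dict, List

# keys are variant/subgroup labels, values are lists of per-instance scores (values are made up)
variant_scores_dict: Dict[str, List[float]] = {
    "original": [1.0],
    "paraphrase": [0.5, 0.0],
}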
From 1ca53170fa6834b910d2c02063af9852e4ce50b6 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Mon, 5 Feb 2024 16:47:24 +0200 Subject: [PATCH 50/83] fix type hint in validate_variant_types --- src/unitxt/metrics.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 1371f87b6f..05e66f78d4 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -2101,7 +2101,7 @@ def should_ignore_element(self, element, additional_input): # define metrics that return means of an aggregation function applied across levels of a grouping variable def validate_variant_types( - variant_scores_dict: Dict[List], expected_variant_types: None + variant_scores_dict: Dict[str, List], expected_variant_types: List[str] ): """Validate a dict of variant type instance score lists. @@ -2120,6 +2120,8 @@ def validate_variant_types( for kk, vv in variant_scores_dict.items() } ) + if expected_variant_types is None: + expected_variant_types = [""] # make sure the expected types appear variant_scores_dict.update( {kk: [] for kk in expected_variant_types if kk not in variant_scores_dict} From bdb8b04f25c23a1a1fb7c0aab44b3ef41033e373 Mon Sep 17 00:00:00 2001 From: sam-data-guy-iam <124508972+sam-data-guy-iam@users.noreply.github.com> Date: Mon, 5 Feb 2024 17:13:33 +0200 Subject: [PATCH 51/83] Delete src/unitxt/catalog/metrics/robustness/fixed_group_mean_others_accuracy.json metric was renamed --- .../metrics/robustness/fixed_group_mean_others_accuracy.json | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_mean_others_accuracy.json diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_mean_others_accuracy.json b/src/unitxt/catalog/metrics/robustness/fixed_group_mean_others_accuracy.json deleted file mode 100644 index 2ae1094239..0000000000 --- a/src/unitxt/catalog/metrics/robustness/fixed_group_mean_others_accuracy.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "type": "fixed_group_mean_paraphrase_accuracy" -} From 101c08d2ce08083d5c2b42fe0cfbfbf37ed93504 Mon Sep 17 00:00:00 2001 From: sam-data-guy-iam <124508972+sam-data-guy-iam@users.noreply.github.com> Date: Mon, 5 Feb 2024 17:13:48 +0200 Subject: [PATCH 52/83] Delete src/unitxt/catalog/metrics/robustness/fixed_group_mean_others_string_containment.json metric was renamed --- .../robustness/fixed_group_mean_others_string_containment.json | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_mean_others_string_containment.json diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_mean_others_string_containment.json b/src/unitxt/catalog/metrics/robustness/fixed_group_mean_others_string_containment.json deleted file mode 100644 index 08ad6bca43..0000000000 --- a/src/unitxt/catalog/metrics/robustness/fixed_group_mean_others_string_containment.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "type": "fixed_group_mean_others_string_containment" -} From 92ae38eab154569a14a92e01242ebda4e1c31e93 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Sun, 11 Feb 2024 13:06:34 +0200 Subject: [PATCH 53/83] initial commit --- .../metrics/robustness/fixed_group_cohens_d_accuracy.json | 3 +++ .../robustness/fixed_group_cohens_d_string_containment.json | 3 +++ 2 files changed, 6 insertions(+) create mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_accuracy.json create mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_string_containment.json diff --git 
a/src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_accuracy.json b/src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_accuracy.json new file mode 100644 index 0000000000..e60abead20 --- /dev/null +++ b/src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_accuracy.json @@ -0,0 +1,3 @@ +{ + "type": "fixed_group_cohens_d_accuracy" +} diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_string_containment.json b/src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_string_containment.json new file mode 100644 index 0000000000..e68f00a376 --- /dev/null +++ b/src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_string_containment.json @@ -0,0 +1,3 @@ +{ + "type": "fixed_group_cohens_d_string_containment" +} From 0278f87731384b8fdc339f23e9600b2fe66a66d4 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Sun, 11 Feb 2024 13:12:03 +0200 Subject: [PATCH 54/83] implement PR changes; rename variant to subgroup; add Cohen's d metric --- prepare/metrics/grouped_instance_metrics.py | 57 +++ src/unitxt/metrics.py | 437 +++++++++++++------- tests/test_metrics.py | 18 + 3 files changed, 374 insertions(+), 138 deletions(-) diff --git a/prepare/metrics/grouped_instance_metrics.py b/prepare/metrics/grouped_instance_metrics.py index c2aca38f20..fa232bfa01 100644 --- a/prepare/metrics/grouped_instance_metrics.py +++ b/prepare/metrics/grouped_instance_metrics.py @@ -4,6 +4,8 @@ from src.unitxt import add_to_catalog from src.unitxt.metrics import ( + FixedGroupCohensDAccuracy, + FixedGroupCohensDStringContainment, FixedGroupMeanAccuracy, FixedGroupMeanBaselineAccuracy, FixedGroupMeanBaselineStringContainment, @@ -437,6 +439,61 @@ overwrite=True, ) + +# Cohen's D will always use fixed groups +metric = FixedGroupCohensDAccuracy() +global_target = { + "fixed_group_cohens_d_accuracy": -333.55, + "score": -333.55, + "score_name": "fixed_group_cohens_d_accuracy", + "score_ci_low": -1000.0, + "score_ci_high": 0.5, + "fixed_group_cohens_d_accuracy_ci_low": -1000.0, + "fixed_group_cohens_d_accuracy_ci_high": 0.5, +} + + +outputs = test_metric( + metric=metric, + predictions=predictions, + references=references, + instance_targets=instance_targets_accuracy, + global_target=global_target, + additional_inputs=additional_inputs, +) + +add_to_catalog( + metric, "metrics.robustness.fixed_group_cohens_d_accuracy", overwrite=True +) + + +metric = FixedGroupCohensDStringContainment() +global_target = { + "fixed_group_cohens_d_string_containment": -0.77, + "score": -0.77, + "score_name": "fixed_group_cohens_d_string_containment", + "score_ci_low": -0.87, + "score_ci_high": -0.58, + "fixed_group_cohens_d_string_containment_ci_low": -0.87, + "fixed_group_cohens_d_string_containment_ci_high": -0.58, +} + + +outputs = test_metric( + metric=metric, + predictions=predictions, + references=references, + instance_targets=instance_targets_string_containment, + global_target=global_target, + additional_inputs=additional_inputs, +) + +add_to_catalog( + metric, + "metrics.robustness.fixed_group_cohens_d_string_containment", + overwrite=True, +) + # TokenOverlap: example of a metric that has more than one score global_target = { diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 05e66f78d4..aa685654f4 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -139,16 +139,15 @@ def score_based_confidence_interval( ci_score_prefix = str(ci_score_prefix) if aggregation_func is None: + # if aggregation_func is None, we simply take the mean of the resampled instance scores 
+ # otherwise, the aggregation_func needs to be applied AFTER resampling the instances; + # that is, re-form the groups, calculate the function, and take the mean of the group scores aggregation_func = self.average_item_scores for score_name in score_names: # need to redefine the statistic function within the loop because score_name is a loop variable, to avoid ruff errors def statistic(arr, axis, score_name=score_name): # arr is a 2d array where each row is a resampling, so we # iterate over the rows and compute the metric on each resampling - - # if aggregation_func is None, we simply take the mean of the resampled instance scores - # otherwise, the aggregation_func needs to be applied AFTER resampling the instances; - # that is, re-form the groups, calculate the function, and take the mean of the group scores scores = numpy.apply_along_axis( lambda resampled_instances: aggregation_func( resampled_instances, score_name @@ -448,7 +447,8 @@ class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval): """ n_resamples = _N_RESAMPLES_DEFAULT_FOR_INSTANCE_METRICS - + # column required to be in additional_inputs if group_mean aggregation function requires a dict input of labels and their lists of scores + subgroup_column = None implemented_reductions: List[str] = field( default_factory=lambda: ["mean", "group_mean"] ) @@ -458,14 +458,6 @@ class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval): def reduction_map(self) -> dict: pass - @staticmethod - def _function_uses_variant_types(aggregation_func): - import inspect - - func_args = list(inspect.signature(aggregation_func).parameters.keys()) - # if function has this argument, assume it does comparison - return "variant_scores_dict" in func_args - def _validate_group_mean_reduction(self, instances: List[dict]): """Ensure that group_mean reduction_map is properly formatted. @@ -474,7 +466,7 @@ def _validate_group_mean_reduction(self, instances: List[dict]): class GroupVarianceAccuracy(Accuracy): reduction_map = {'group_mean': {'agg_func': ['variance', np.var, True]}} - reduction_map must be a dict with + reduction_map must be a dict with values containing - an 'agg_func' field with value being a 3-element list where - 1st element is a string name of the aggregation function (used in naming the CI report) - 2nd element is the callable aggregation function @@ -490,22 +482,23 @@ class GroupVarianceAccuracy(Accuracy): The aggregation function (2nd element of agg_func) can be one of two types: 1. simple: calculate a summary statistic from a single group of values (e.g. mean, median, etc.). This is best suited for cases where the instances are independent of each other, other than belonging to the same group - 2. comparison: requires additional_inputs to have a boolean key 'variant_type'. This function conducts - a comparison between scores for differing variant_types (e.g., 'original' vs 'paraphrase'). - An example is where the baseline instance is a question, and the others are various paraphrases + 2. comparison: requires subgroup_column to be specified. This function conducts + a comparison between scores for differing values of subgroup_column (e.g., 'original' vs 'paraphrase'). + An example is where the original instance is a question, and the others are various paraphrases or perturbations of this question. Here, the function would return, say, a comparison of the instance accuracies - rather than, say, the average instance accuracy. It requires an argument variant_scores_dict. 
+ rather than, say, the average instance accuracy. In these cases, we recommend setting the 3rd parameter to be True so that the groups are resampled together. Example: class GroupVsBaselineDiffAccuracy(Accuracy): + subgroup_column = 'variant_type' reduction_map = {'group_mean': {'agg_func': ['accuracy_diff', accuracy_diff, True],}} # where the function is defined as - def accuracy_diff(variant_scores_dict, expected_variant_types=['original', 'paraphrase']): - validate_variant_types(variant_scores_dict, expected_variant_types) + def accuracy_diff(subgroup_scores_dict, expected_subgroup_types=['original', 'paraphrase']): + validate_subgroup_types(subgroup_scores_dict, expected_subgroup_types) from statistics import mean - return mean(variant_scores_dict['paraphrase']) - mean(variant_scores_dict['original']) + return mean(subgroup_scores_dict['paraphrase']) - mean(subgroup_scores_dict['original']) The input dataset should look like: 'group_id' 'question' 'variant_type' @@ -554,12 +547,13 @@ def accuracy_diff(variant_scores_dict, expected_variant_types=['original', 'para if "score_fields" in fields: assert isinstance(fields["score_fields"], list) - # for aggregations that conduct a comparison, expects a 'variant_type' field - if self._function_uses_variant_types(fields["agg_func"][1]): + # for aggregation functions that use the subgroup_column (expect a dict of lists), check that + # this field exists + if self.subgroup_column is not None: assert all( - "variant_type" in instance["additional_inputs"] + self.subgroup_column in instance["additional_inputs"] for instance in instances - ), f"since group_mean aggregation function {fields['agg_func'][1]} performs a comparison, each instance's additional_inputs dict must have a key variant_type" + ), f"each instance additional_inputs dict must have a key {self.subgroup_column}" def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator: instances, global_score = self.compute_instance_scores(stream) @@ -575,7 +569,7 @@ def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generato aggregation_function = self.average_item_scores if reduction_type == "mean": reduction_fields = list(set(reduction_params)) - # extract only the dict of instance scores + # no group reduction, so resample instances individually scores_to_resample = deepcopy(instances) elif reduction_type == "group_mean": self._validate_group_mean_reduction(instances=instances) @@ -587,6 +581,7 @@ def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generato aggregation_function_name = str(reduction_params["agg_func"][0]) field_name_full_prefix = "group_" + aggregation_function_name + "_" if reduction_params["agg_func"][2]: + # append fixed_ to name because resamples the groups as fixed units field_name_full_prefix = "fixed_" + field_name_full_prefix ( scores_to_resample, @@ -665,38 +660,39 @@ def get_group_scores( Args: instances: List of observation instances with instance-level scores (fields) computed. field_names: List of instance score names in each instance to apply the aggregation function. - group_aggregation_func: Callable aggregation function accepting a list of numeric scores, or a dict of variant types scores, - and returning a single score. + group_aggregation_func: Callable aggregation function accepting a list of numeric scores; + or, if self.subgroup_column is not None, a dict of subgroup types scores by subgroup_column value. 
+ callable function returns a single score for the group Returns: - List of dicts, each corresponding to a group of instances (defined by grouping_field), + List of dicts, each corresponding to a group of instances (defined by 'group_id'), with a group score for each field_name """ from collections import defaultdict # three-level defaultdict: - # first is the grouping, second is the field name, the third is the variant_type (by default 'original') + # first is the grouping, second is the field name, the third is the subgroup_type (by default 'default') group_to_instance_scores = defaultdict( lambda: defaultdict(lambda: defaultdict(list)) ) - # check if function has fields for variant_type - uses_variant_type = self._function_uses_variant_types(group_aggregation_func) - if uses_variant_type: + # check if function has fields for subgroup_column + uses_subgroups = self.subgroup_column is not None + if uses_subgroups: assert all( - "variant_type" in instance["additional_inputs"] + self.subgroup_column in instance["additional_inputs"] for instance in instances - ), "all instances must have field 'variant_type' in additional_inputs" + ), f"all instances must have field {self.subgroup_column}' in additional_inputs" # define the aggregation function - def agg_func(variant_scores_dict): - # if function a uses the variant types, pass the full dict - return group_aggregation_func(variant_scores_dict=variant_scores_dict) + def agg_func(subgroup_scores_dict): + # if function a uses the subgroup_column values, pass the full dict + return group_aggregation_func(subgroup_scores_dict) else: - def agg_func(variant_scores_dict): + def agg_func(subgroup_scores_dict): # otherwise pass the default 'original' scores to the default argument - return group_aggregation_func(variant_scores_dict["original"]) + return group_aggregation_func(subgroup_scores_dict["default"]) # loop through the instances and group the scores for instance in instances: @@ -707,12 +703,13 @@ def agg_func(variant_scores_dict): f"This field is required for group based metric computation." 
) group_key = additional_inputs["group_id"] - # for functions that do comparisons between variant_type gorups - variant_type = ( - additional_inputs["variant_type"] if uses_variant_type else "original" + # for functions that do comparisons between subgroup_column groups + # if function doesn't use subgroup_column, or none is present, set "default" as default value, and pass all scores + subgroup_type = ( + additional_inputs[self.subgroup_column] if uses_subgroups else "default" ) for field_name in field_names: - group_to_instance_scores[group_key][field_name][variant_type].append( + group_to_instance_scores[group_key][field_name][subgroup_type].append( instance["score"]["instance"][field_name] ) @@ -722,22 +719,23 @@ def agg_func(variant_scores_dict): "score": { "instance": { field_name: agg_func(score_dict) - for field_name, score_dict in scores.items() + for field_name, score_dict in group_scores.items() } } } - for scores in group_to_instance_scores.values() + for group_scores in group_to_instance_scores.values() ] def _set_up_group_mean_aggregation( self, instances, reduction_params, reduction_fields ): + group_aggregation_func = reduction_params["agg_func"][1] # if treat groups as units if reduction_params["agg_func"][2]: # pass the group aggregate---not instance---scores to resample as usual aggregation_function = self.average_item_scores scores_to_resample = self.get_group_scores( - instances, reduction_fields, reduction_params["agg_func"][1] + instances, reduction_fields, group_aggregation_func ) else: # pass the instance scores to resample, and calculate the group aggregation on the resamplings @@ -746,7 +744,7 @@ def _set_up_group_mean_aggregation( def aggregation_function( instances, field_name, - group_aggregation_func=reduction_params["agg_func"][1], + group_aggregation_func=group_aggregation_func, ): group_scores = self.get_group_scores( instances, [field_name], group_aggregation_func @@ -2100,44 +2098,46 @@ def should_ignore_element(self, element, additional_input): # define metrics that return means of an aggregation function applied across levels of a grouping variable -def validate_variant_types( - variant_scores_dict: Dict[str, List], expected_variant_types: List[str] +def validate_subgroup_types( + subgroup_scores_dict: Dict[str, List], expected_subgroup_types: List[str] ): - """Validate a dict of variant type instance score lists. + """Validate a dict of subgroup type instance score lists. Args: - variant_scores_dict: dict where keys are variant types and values are lists of instance scores. - expected_variant_types: list of the variant types which should exist in variant_scores_dict, so + subgroup_scores_dict: dict where keys are subgroup types and values are lists of instance scores. + expected_subgroup_types: list of the subgroup types which should exist in subgroup_scores_dict, so that another function that receives it as input will valid. 
Returns: dict with all NaN scores removed; any expected keys that are missing have an empty list inserted """ # remove any NaNs - variant_scores_dict.update( + subgroup_scores_dict.update( { kk: [vvv for vvv in vv if not np.isnan(vvv)] - for kk, vv in variant_scores_dict.items() + for kk, vv in subgroup_scores_dict.items() } ) - if expected_variant_types is None: - expected_variant_types = [""] + if expected_subgroup_types is None: + expected_subgroup_types = [] # make sure the expected types appear - variant_scores_dict.update( - {kk: [] for kk in expected_variant_types if kk not in variant_scores_dict} + subgroup_scores_dict.update( + {kk: [] for kk in expected_subgroup_types if kk not in subgroup_scores_dict} ) - return variant_scores_dict + return subgroup_scores_dict -def performance_drop_rate(variant_scores_dict: dict, expected_variant_types=None): +def performance_drop_rate( + subgroup_scores_dict: Dict[str, List], expected_subgroup_types: List[str] +): """Percentage decrease of mean performance on test elements relative to that on a baseline. from https://arxiv.org/pdf/2306.04528.pdf. Args: - variant_scores_dict: dict where keys are from the set ('original', 'paraphrase') and values are lists - of instance scores corresponding to variant_types with that key - expected_variant_types: tuple of the expected labels in variant_scores_dict (the first should be the + subgroup_scores_dict: dict where keys are from the set ('original', 'paraphrase') and values are lists + of instance scores corresponding to subgroup_types with that key + expected_subgroup_types: list of the expected labels in subgroup_scores_dict (the first should be the baseline type, the second the other). Returns: @@ -2146,23 +2146,64 @@ def performance_drop_rate(variant_scores_dict: dict, expected_variant_types=None otherwise, calculate PDR """ - if expected_variant_types is None: - expected_variant_types = ["original", "paraphrase"] - assert len(expected_variant_types) == 2 - variant_scores_dict = validate_variant_types( - variant_scores_dict, expected_variant_types + assert len(expected_subgroup_types) == 2 + subgroup_scores_dict = validate_subgroup_types( + subgroup_scores_dict, expected_subgroup_types ) - if any(len(variant_scores_dict[kk]) == 0 for kk in expected_variant_types): + if any(len(subgroup_scores_dict[kk]) == 0 for kk in expected_subgroup_types): # no comparison can be made since there is not at least one score per type return np.nan # first key should be the baseline category - baseline_mean = mean(variant_scores_dict[expected_variant_types[0]]) - other_mean = mean(variant_scores_dict[expected_variant_types[1]]) + baseline_mean = mean(subgroup_scores_dict[expected_subgroup_types[0]]) + other_mean = mean(subgroup_scores_dict[expected_subgroup_types[1]]) return np.nan if baseline_mean == 0 else 1 - other_mean / baseline_mean +def interpret_cohens_effect_size(x: float): + """Return a string interpretation of a Cohen effect size value. + + See https://en.wikipedia.org/wiki/Effect_size; + Cohen, Jacob (1988). Statistical Power Analysis for the Behavioral Sciences; and + Sawilowsky, S (2009). "New effect size rules of thumb". Journal of Modern Applied Statistical Methods. 8 (2): 467-474. 
+ + Value has interpretation of + - essentially 0 if |x| < 0.01 + - very small if 0.01 <= |x| < 0.2 + - small difference if 0.2 <= |x| < 0.5 + - a medium difference if 0.5 <= |x| < 0.8 + - a large difference if 0.8 <= |x| < 1.2 + - a very large difference if 1.2 <= |x| < 2.0 + - a huge difference if 2.0 <= |x| + + Args: + x: float effect size value + + Returns: + string interpretation + """ + import pandas as pd + + # assign a label according to threshold of the absolute value + return pd.cut( + x=[np.abs(x)], + right=False, + bins=[-1, 0.01, 0.2, 0.5, 0.8, 1.2, 2.0, np.Inf], + labels=[ + "essentially zero", + "very small", + "small", + "medium", + "large", + "very large", + "huge", + ], + )[0] + + def normalized_cohens_h( - variant_scores_dict: dict, expected_variant_types=None, interpret=False + subgroup_scores_dict: Dict[str, List], + expected_subgroup_types: List[str], + interpret=False, ): """Cohen's h effect size between two proportions, normalized to interval [-1,1]. @@ -2175,99 +2216,122 @@ def normalized_cohens_h( Assumes the scores are in [0,1], either continuous or binary; hence taking the average of a group of scores yields a proportion.. Calculates the change in the average of the other_scores relative to the average of the baseline_scores. We rescale this to [-1,1] from [-pi,pi] for clarity, where +- 1 are the most extreme changes, and 0 is no change - Interpretation: the original unscaled Cohen's h can be interpreted as - - no difference if |h| = 0 - - an insignificant difference if 0 < |h| < 0.2 - - small difference if 0.2 <= |h| < 0.5 - - a medium difference if 0.5 <= |h| < 0.8 - - a large difference if 0.8 <= |h| + Interpretation: the original unscaled Cohen's h can be interpreted according to function interpret_effect_size + Thus, the rule of interpreting the effect of the normalized value is to use the same thresholds divided by pi - - no difference if |norm h| = 0 - - an insignificant difference if 0 < |norm h| < 0.06366198 + - essentially 0 if |norm h| < 0.0031831 + - very small if 0.0031831 <= |norm h| < 0.06366198 - small difference if 0.06366198 <= |norm h| < 0.15915494 - a medium difference if 0.15915494 <= |norm h| < 0.25464791 - - a large difference if 0.25464791 <= |norm h| + - a large difference if 0.25464791 <= |norm h| < 0.38197186 + - a very large difference if 0.38197186 <= |norm h| < 0.63661977 + - a huge difference if 0.63661977 <= |norm h| Args: - variant_scores_dict: dict where keys are from the set ('original', 'paraphrase') and values are lists - of instance scores corresponding to variant_types with that key - expected_variant_types: tuple of the expected labels in variant_scores_dict (the first should be the + subgroup_scores_dict: dict where keys are from the set ('original', 'paraphrase') and values are lists + of instance scores corresponding to subgroup_types with that key + expected_subgroup_types: list of the expected labels in subgroup_scores_dict (the first should be the baseline type, the second the other). 
interpret: boolean, whether to interpret the significance of the score or not Returns: float score between -1 and 1, and a string interpretation if interpret=True """ - if expected_variant_types is None: - expected_variant_types = ["original", "paraphrase"] - assert len(expected_variant_types) == 2 - variant_scores_dict = validate_variant_types( - variant_scores_dict, expected_variant_types + assert len(expected_subgroup_types) == 2 + subgroup_scores_dict = validate_subgroup_types( + subgroup_scores_dict, expected_subgroup_types ) - if any(len(variant_scores_dict[kk]) == 0 for kk in expected_variant_types): + if any(len(subgroup_scores_dict[kk]) == 0 for kk in expected_subgroup_types): # no comparison can be made since there is not at least one score per type return np.nan # requires scores to be in [0,1] - for kk, score_list in variant_scores_dict.items(): + for kk, score_list in subgroup_scores_dict.items(): assert all( 0 <= score <= 1 for score in score_list ), f"all {kk} scores must be in [0,1]" - baseline_mean = mean(variant_scores_dict[expected_variant_types[0]]) - other_mean = mean(variant_scores_dict[expected_variant_types[1]]) + + baseline_mean = mean(subgroup_scores_dict[expected_subgroup_types[0]]) + other_mean = mean(subgroup_scores_dict[expected_subgroup_types[1]]) h = 2 * (np.arcsin(np.sqrt(other_mean)) - np.arcsin(np.sqrt(baseline_mean))) norm_h = np.clip(a=h / np.pi, a_min=-1, a_max=1) if not interpret: return norm_h - import pandas as pd + return norm_h, interpret_cohens_effect_size(h) - how_signif = pd.cut( - x=[np.abs(h)], - right=False, - bins=[-1, 0.2, 0.5, 0.8, np.Inf], - labels=["not significant", "small", "medium", "large"], - ) - return norm_h, how_signif[0] - -def mean_variant_score( - variant_scores_dict: dict, expected_variant_type: str = "original" +def cohens_d( + subgroup_scores_dict: Dict[str, List], + expected_subgroup_types: List[str], + interpret=False, ): - """Return the mean instance score for a single type of variant (not a comparison). + """Cohen's d effect size between mean of two samples. - Args: - variant_scores_dict: dict where keys are variant types and values are lists of instance scores. - expected_variant_type: the key (variant type) for which the average will be computed + Takes into account the variances within the samples, not just the means. + Args: + subgroup_scores_dict: dict where keys are from the set ('original', 'paraphrase') and values are lists + of instance scores corresponding to subgroup_types with that key + expected_subgroup_types: list of the expected labels in subgroup_scores_dict (the first should be the + baseline type, the second the other). + interpret: boolean, whether to interpret the significance of the score or not Returns: - float score + float score, and a string interpretation if interpret=True """ - variant_scores_dict = validate_variant_types( - variant_scores_dict, [expected_variant_type] + assert len(expected_subgroup_types) == 2 + subgroup_scores_dict = validate_subgroup_types( + subgroup_scores_dict, expected_subgroup_types ) - score_list = variant_scores_dict[expected_variant_type] - if len(score_list) == 0: - # no scores to use + group_n = [len(subgroup_scores_dict[st]) for st in expected_subgroup_types] + if not any(nn > 1 for nn in group_n): + # if at least one sample size is 0 for one type, no comparison can be made at at all + # if both sample sizes are 1, then the denominator is undefined since divide by n1 + n2 - 2 + # so require at least one sample to have > 1 observation, and both to have >= 1. 
return np.nan - return mean(score_list) - -def mean_original_score(variant_scores_dict: dict): - """Return average score on the baseline only. + # otherwise, calculate the variances + group_mean = [mean(subgroup_scores_dict[st]) for st in expected_subgroup_types] + # sample variance with 1 degree of freedom (denominator n-1); if n=1, return 0 since otherwise throws an error + group_var = [ + 0.0 if nn == 1 else np.var(subgroup_scores_dict[st], ddof=1) + for st, nn in zip(expected_subgroup_types, group_n) + ] + var_total = sum([(nn - 1) * vv for vv, nn in zip(group_var, group_n)]) + pooled_sd = np.sqrt(var_total / (sum(group_n) - 2)) + d = np.diff(group_mean)[0] / pooled_sd + # clip it at a very large value so it doesn't become infinite if the variance (denominator) is 0 + d = np.clip(a=d, a_min=-1000, a_max=1000) - Args: - variant_scores_dict: dict where one key should be 'original' and the values a list of - original instance scores - """ - return mean_variant_score(variant_scores_dict, "original") + if not interpret: + return d + return d, interpret_cohens_effect_size(d) -def mean_paraphrase_score(variant_scores_dict: dict): - """Return average score on the paraphrases only. +def mean_subgroup_score( + subgroup_scores_dict: Dict[str, List], expected_subgroup_types: List[str] +): + """Return the mean instance score for a subset (possibly a single type) of variants (not a comparison). Args: - variant_scores_dict: dict where one key should be 'paraphrase' and the values a list of - original instance scores + subgroup_scores_dict: dict where keys are subgroup types and values are lists of instance scores. + expected_subgroup_types: the keys (subgroup types) for which the average will be computed + + Returns: + float score """ - return mean_variant_score(variant_scores_dict, "paraphrase") + subgroup_scores_dict = validate_subgroup_types( + subgroup_scores_dict, expected_subgroup_types + ) + from itertools import chain + + # combine all desired subgroup scores + score_list = list( + chain.from_iterable( + [subgroup_scores_dict[st] for st in expected_subgroup_types] + ) + ) + if len(score_list) == 0: + # no scores to use + return np.nan + return mean(score_list) # metrics using mean reduction @@ -2292,51 +2356,99 @@ class FixedGroupMeanStringContainment(StringContainment): # take only the (fixed) group mean of baseline or other (paraphrases) scores class FixedGroupMeanBaselineAccuracy(Accuracy): + subgroup_column = "variant_type" + # take mean of "original" variants only reduction_map = { "group_mean": { - "agg_func": ["mean_baseline", mean_original_score, True], + "agg_func": [ + "mean_baseline", + lambda scd: mean_subgroup_score( + subgroup_scores_dict=scd, expected_subgroup_types=["original"] + ), + True, + ], } } class FixedGroupMeanParaphraseAccuracy(Accuracy): + subgroup_column = "variant_type" + # take mean of "paraphrase" variants only reduction_map = { "group_mean": { - "agg_func": ["mean_paraphrase", mean_paraphrase_score, True], + "agg_func": [ + "mean_paraphrase", + lambda scd: mean_subgroup_score( + subgroup_scores_dict=scd, expected_subgroup_types=["paraphrase"] + ), + True, + ], } } # same as above but using StringContainment class FixedGroupMeanBaselineStringContainment(StringContainment): + subgroup_column = "variant_type" + # take mean of "original" variants only reduction_map = { "group_mean": { - "agg_func": ["mean_baseline", mean_original_score, True], + "agg_func": [ + "mean_baseline", + lambda scd: mean_subgroup_score( + subgroup_scores_dict=scd, 
expected_subgroup_types=["original"] + ), + True, + ], } } class FixedGroupMeanParaphraseStringContainment(StringContainment): + subgroup_column = "variant_type" + # take mean of "paraphrase" variants only reduction_map = { "group_mean": { - "agg_func": ["mean_paraphrase", mean_paraphrase_score, True], + "agg_func": [ + "mean_paraphrase", + lambda scd: mean_subgroup_score( + subgroup_scores_dict=scd, expected_subgroup_types=["paraphrase"] + ), + True, + ], } } # using PDR class FixedGroupPDRAccuracy(Accuracy): + subgroup_column = "variant_type" reduction_map = { "group_mean": { - "agg_func": ["pdr", performance_drop_rate, True], + "agg_func": [ + "pdr", + lambda scd: performance_drop_rate( + subgroup_scores_dict=scd, + expected_subgroup_types=["original", "paraphrase"], + ), + True, + ], } } class FixedGroupPDRStringContainment(StringContainment): + subgroup_column = "variant_type" reduction_map = { "group_mean": { - "agg_func": ["pdr", performance_drop_rate, True], + "agg_func": [ + "pdr", + lambda scd: performance_drop_rate( + subgroup_scores_dict=scd, + expected_subgroup_types=["original", "paraphrase"], + ), + True, + ], } } @@ -2350,18 +2462,67 @@ class GroupMeanTokenOverlap(TokenOverlap): } -# using Cohens's h +# using Cohens's h for proportions class FixedGroupNormCohensHAccuracy(Accuracy): + subgroup_column = "variant_type" reduction_map = { "group_mean": { - "agg_func": ["norm_cohens_h", normalized_cohens_h, True], + "agg_func": [ + "norm_cohens_h", + lambda scd: normalized_cohens_h( + subgroup_scores_dict=scd, + expected_subgroup_types=["original", "paraphrase"], + ), + True, + ], } } class FixedGroupNormCohensHStringContainment(StringContainment): + subgroup_column = "variant_type" + reduction_map = { + "group_mean": { + "agg_func": [ + "norm_cohens_h", + lambda scd: normalized_cohens_h( + subgroup_scores_dict=scd, + expected_subgroup_types=["original", "paraphrase"], + ), + True, + ], + } + } + + +# using Cohen's d (takes into account internal variation in group scores) +class FixedGroupCohensDAccuracy(Accuracy): + subgroup_column = "variant_type" reduction_map = { "group_mean": { - "agg_func": ["norm_cohens_h", normalized_cohens_h, True], + "agg_func": [ + "cohens_d", + lambda scd: cohens_d( + subgroup_scores_dict=scd, + expected_subgroup_types=["original", "paraphrase"], + ), + True, + ], + } + } + + +class FixedGroupCohensDStringContainment(StringContainment): + subgroup_column = "variant_type" + reduction_map = { + "group_mean": { + "agg_func": [ + "cohens_d", + lambda scd: cohens_d( + subgroup_scores_dict=scd, + expected_subgroup_types=["original", "paraphrase"], + ), + True, + ], } } diff --git a/tests/test_metrics.py b/tests/test_metrics.py index fddbd6d24e..b6acde64aa 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -12,6 +12,8 @@ F1Micro, F1MicroMultiLabel, F1Weighted, + FixedGroupCohensDAccuracy, + FixedGroupCohensDStringContainment, FixedGroupMeanAccuracy, FixedGroupMeanBaselineAccuracy, FixedGroupMeanBaselineStringContainment, @@ -484,6 +486,8 @@ def test_grouped_instance_metrics(self): FixedGroupNormCohensHStringContainment(), FixedGroupPDRAccuracy(), FixedGroupPDRStringContainment(), + FixedGroupCohensDAccuracy(), + FixedGroupCohensDStringContainment(), ] global_targets = [ 0.225, @@ -499,6 +503,8 @@ def test_grouped_instance_metrics(self): -0.4639421840102023, 0.8333333333333334, 0.4444444444444445, + -333.55156684612643, + -0.7698003589195009, ] for metric, target in zip(accuracy_metrics, global_targets): outputs = apply_metric( @@ -741,6 
+747,18 @@ def test_grouped_instance_metric_confidence_interval(self): expected_ci_high=0.5, ) + self._test_grouped_instance_confidence_interval( + metric=FixedGroupCohensDAccuracy(), + expected_ci_low=-1000.0, + expected_ci_high=0.5, + ) + + self._test_grouped_instance_confidence_interval( + metric=FixedGroupCohensDStringContainment(), + expected_ci_low=-0.8660254037844387, + expected_ci_high=-0.5773502691896257, + ) + # pass global dict because there are additional fields other than the main score self._test_grouped_instance_confidence_interval( metric=GroupMeanTokenOverlap(), From 1c6a25206bca24fab41952be39aabc7417e0d0c1 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Sun, 11 Feb 2024 18:06:57 +0200 Subject: [PATCH 55/83] correct condition on cohen's d sample sizes --- src/unitxt/metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 0b8e1932a4..c5d58eb82f 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -2301,8 +2301,8 @@ def cohens_d( subgroup_scores_dict, expected_subgroup_types ) group_n = [len(subgroup_scores_dict[st]) for st in expected_subgroup_types] - if not any(nn > 1 for nn in group_n): - # if at least one sample size is 0 for one type, no comparison can be made at at all + if any(nn == 0 for nn in group_n) or all(nn <= 1 for nn in group_n): + # if at least one sample size is 0 for one type, no comparison can be made at all # if both sample sizes are 1, then the denominator is undefined since divide by n1 + n2 - 2 # so require at least one sample to have > 1 observation, and both to have >= 1. return np.nan From c7236f9c78871d58fee05bf4135c42c2d8f3ce0b Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Mon, 12 Feb 2024 12:34:47 +0200 Subject: [PATCH 56/83] adapt PDR, Cohens' D and H to accept a list of list of labels (so that a comparison group can consist of multiple sub-groups) --- prepare/metrics/grouped_instance_metrics.py | 96 ++++++------ src/unitxt/metrics.py | 163 ++++++++++++-------- tests/test_metrics.py | 40 ++--- 3 files changed, 171 insertions(+), 128 deletions(-) diff --git a/prepare/metrics/grouped_instance_metrics.py b/prepare/metrics/grouped_instance_metrics.py index fa232bfa01..430c16f6db 100644 --- a/prepare/metrics/grouped_instance_metrics.py +++ b/prepare/metrics/grouped_instance_metrics.py @@ -4,18 +4,18 @@ from src.unitxt import add_to_catalog from src.unitxt.metrics import ( - FixedGroupCohensDAccuracy, - FixedGroupCohensDStringContainment, + FixedGroupCohensDParaphraseAccuracy, + FixedGroupCohensDParaphraseStringContainment, FixedGroupMeanAccuracy, FixedGroupMeanBaselineAccuracy, FixedGroupMeanBaselineStringContainment, FixedGroupMeanParaphraseAccuracy, FixedGroupMeanParaphraseStringContainment, FixedGroupMeanStringContainment, - FixedGroupNormCohensHAccuracy, - FixedGroupNormCohensHStringContainment, - FixedGroupPDRAccuracy, - FixedGroupPDRStringContainment, + FixedGroupNormCohensHParaphraseAccuracy, + FixedGroupNormCohensHParaphraseStringContainment, + FixedGroupPDRParaphraseAccuracy, + FixedGroupPDRParaphraseStringContainment, GroupMeanAccuracy, GroupMeanStringContainment, GroupMeanTokenOverlap, @@ -337,15 +337,15 @@ # PDR: will always use fixed groups -metric = FixedGroupPDRAccuracy() +metric = FixedGroupPDRParaphraseAccuracy() global_target = { - "fixed_group_pdr_accuracy": 0.83, + "fixed_group_pdr_paraphrase_accuracy": 0.83, "score": 0.83, - "score_name": "fixed_group_pdr_accuracy", + "score_name": "fixed_group_pdr_paraphrase_accuracy", 
"score_ci_low": 0.67, "score_ci_high": 1.0, - "fixed_group_pdr_accuracy_ci_low": 0.67, - "fixed_group_pdr_accuracy_ci_high": 1.0, + "fixed_group_pdr_paraphrase_accuracy_ci_low": 0.67, + "fixed_group_pdr_paraphrase_accuracy_ci_high": 1.0, } @@ -358,17 +358,19 @@ additional_inputs=additional_inputs, ) -add_to_catalog(metric, "metrics.robustness.fixed_group_pdr_accuracy", overwrite=True) +add_to_catalog( + metric, "metrics.robustness.fixed_group_pdr_paraphrase_accuracy", overwrite=True +) -metric = FixedGroupPDRStringContainment() +metric = FixedGroupPDRParaphraseStringContainment() global_target = { - "fixed_group_pdr_string_containment": 0.44, + "fixed_group_pdr_paraphrase_string_containment": 0.44, "score": 0.44, - "score_name": "fixed_group_pdr_string_containment", + "score_name": "fixed_group_pdr_paraphrase_string_containment", "score_ci_low": 0.33, "score_ci_high": 0.5, - "fixed_group_pdr_string_containment_ci_low": 0.33, - "fixed_group_pdr_string_containment_ci_high": 0.5, + "fixed_group_pdr_paraphrase_string_containment_ci_low": 0.33, + "fixed_group_pdr_paraphrase_string_containment_ci_high": 0.5, } @@ -382,19 +384,21 @@ ) add_to_catalog( - metric, "metrics.robustness.fixed_group_pdr_string_containment", overwrite=True + metric, + "metrics.robustness.fixed_group_pdr_paraphrase_string_containment", + overwrite=True, ) # Cohen's H will always use fixed groups -metric = FixedGroupNormCohensHAccuracy() +metric = FixedGroupNormCohensHParaphraseAccuracy() global_target = { - "fixed_group_norm_cohens_h_accuracy": -0.42, + "fixed_group_norm_cohens_h_paraphrase_accuracy": -0.42, "score": -0.42, - "score_name": "fixed_group_norm_cohens_h_accuracy", + "score_name": "fixed_group_norm_cohens_h_paraphrase_accuracy", "score_ci_low": -1.0, "score_ci_high": 0.33, - "fixed_group_norm_cohens_h_accuracy_ci_low": -1.0, - "fixed_group_norm_cohens_h_accuracy_ci_high": 0.33, + "fixed_group_norm_cohens_h_paraphrase_accuracy_ci_low": -1.0, + "fixed_group_norm_cohens_h_paraphrase_accuracy_ci_high": 0.33, } @@ -408,19 +412,21 @@ ) add_to_catalog( - metric, "metrics.robustness.fixed_group_norm_cohens_h_accuracy", overwrite=True + metric, + "metrics.robustness.fixed_group_norm_cohens_h_paraphrase_accuracy", + overwrite=True, ) -metric = FixedGroupNormCohensHStringContainment() +metric = FixedGroupNormCohensHParaphraseStringContainment() global_target = { - "fixed_group_norm_cohens_h_string_containment": -0.46, + "fixed_group_norm_cohens_h_paraphrase_string_containment": -0.46, "score": -0.46, - "score_name": "fixed_group_norm_cohens_h_string_containment", + "score_name": "fixed_group_norm_cohens_h_paraphrase_string_containment", "score_ci_low": -0.5, "score_ci_high": -0.39, - "fixed_group_norm_cohens_h_string_containment_ci_low": -0.5, - "fixed_group_norm_cohens_h_string_containment_ci_high": -0.39, + "fixed_group_norm_cohens_h_paraphrase_string_containment_ci_low": -0.5, + "fixed_group_norm_cohens_h_paraphrase_string_containment_ci_high": -0.39, } @@ -435,21 +441,21 @@ add_to_catalog( metric, - "metrics.robustness.fixed_group_norm_cohens_h_string_containment", + "metrics.robustness.fixed_group_norm_cohens_h_paraphrase_string_containment", overwrite=True, ) # Cohen's D will always use fixed groups -metric = FixedGroupCohensDAccuracy() +metric = FixedGroupCohensDParaphraseAccuracy() global_target = { - "fixed_group_cohens_d_accuracy": -333.55, - "score": -333.55, - "score_name": "fixed_group_cohens_d_accuracy", - "score_ci_low": -1000.0, + "fixed_group_cohens_d_paraphrase_accuracy": -1.88, + "score": -1.88, + 
"score_name": "fixed_group_cohens_d_paraphrase_accuracy", + "score_ci_low": -5.0, "score_ci_high": 0.5, - "fixed_group_cohens_d_accuracy_ci_low": -1000.0, - "fixed_group_cohens_d_accuracy_ci_high": 0.5, + "fixed_group_cohens_d_paraphrase_accuracy_ci_low": -5.0, + "fixed_group_cohens_d_paraphrase_accuracy_ci_high": 0.5, } @@ -463,19 +469,21 @@ ) add_to_catalog( - metric, "metrics.robustness.fixed_group_cohens_d_accuracy", overwrite=True + metric, + "metrics.robustness.fixed_group_cohens_d_paraphrase_accuracy", + overwrite=True, ) -metric = FixedGroupCohensDStringContainment() +metric = FixedGroupCohensDParaphraseStringContainment() global_target = { - "fixed_group_cohens_d_string_containment": -0.77, + "fixed_group_cohens_d_paraphrase_string_containment": -0.77, "score": -0.77, - "score_name": "fixed_group_cohens_d_string_containment", + "score_name": "fixed_group_cohens_d_paraphrase_string_containment", "score_ci_low": -0.87, "score_ci_high": -0.58, - "fixed_group_cohens_d_string_containment_ci_low": -0.87, - "fixed_group_cohens_d_string_containment_ci_high": -0.58, + "fixed_group_cohens_d_paraphrase_string_containment_ci_low": -0.87, + "fixed_group_cohens_d_paraphrase_string_containment_ci_high": -0.58, } @@ -490,7 +498,7 @@ add_to_catalog( metric, - "metrics.robustness.fixed_group_cohens_d_string_containment", + "metrics.robustness.fixed_group_cohens_d_paraphrase_string_containment", overwrite=True, ) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index c5d58eb82f..519f3f3562 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -2119,7 +2119,7 @@ def should_ignore_element(self, element, additional_input): # define metrics that return means of an aggregation function applied across levels of a grouping variable def validate_subgroup_types( - subgroup_scores_dict: Dict[str, List], expected_subgroup_types: List[str] + subgroup_scores_dict: Dict[str, List], expected_subgroup_types: List ): """Validate a dict of subgroup type instance score lists. @@ -2140,6 +2140,14 @@ def validate_subgroup_types( ) if expected_subgroup_types is None: expected_subgroup_types = [] + else: + # expected_subgroup_types could be a list of lists; now take unique values + from itertools import chain + + expected_subgroup_types = list( + set(chain.from_iterable(expected_subgroup_types)) + ) + # make sure the expected types appear subgroup_scores_dict.update( {kk: [] for kk in expected_subgroup_types if kk not in subgroup_scores_dict} @@ -2148,7 +2156,7 @@ def validate_subgroup_types( def performance_drop_rate( - subgroup_scores_dict: Dict[str, List], expected_subgroup_types: List[str] + subgroup_scores_dict: Dict[str, List], expected_subgroup_types: List[List[str]] ): """Percentage decrease of mean performance on test elements relative to that on a baseline. @@ -2157,8 +2165,9 @@ def performance_drop_rate( Args: subgroup_scores_dict: dict where keys are from the set ('original', 'paraphrase') and values are lists of instance scores corresponding to subgroup_types with that key - expected_subgroup_types: list of the expected labels in subgroup_scores_dict (the first should be the - baseline type, the second the other). + expected_subgroup_types: 2-element list, each element is a list of strings (typically a single element), + Each list is the label subset whose average scores are to be compared. + The first group should be the baseline, the second the comparison group. Returns: numeric PDR metric. 
@@ -2166,16 +2175,30 @@ def performance_drop_rate( otherwise, calculate PDR """ - assert len(expected_subgroup_types) == 2 + assert ( + len(expected_subgroup_types) == 2 + ), "expected_subgroup_types must have two elements" + assert all( + isinstance(vv, list) for vv in expected_subgroup_types + ), "each element of expected_subgroup_types must be a list" + # make sure each list is unique + expected_subgroup_types = [list(set(vv)) for vv in expected_subgroup_types] + subgroup_scores_dict = validate_subgroup_types( subgroup_scores_dict, expected_subgroup_types ) - if any(len(subgroup_scores_dict[kk]) == 0 for kk in expected_subgroup_types): + # combine all scores from each sub-label (if there are more than 1 in each group) into a list + group_scores_list = [ + np.concatenate([subgroup_scores_dict[vvv] for vvv in vv]) + for vv in expected_subgroup_types + ] + if any(len(scores) == 0 for scores in group_scores_list): # no comparison can be made since there is not at least one score per type return np.nan - # first key should be the baseline category - baseline_mean = mean(subgroup_scores_dict[expected_subgroup_types[0]]) - other_mean = mean(subgroup_scores_dict[expected_subgroup_types[1]]) + # first group are baseline scores, second is others + baseline_mean = mean(group_scores_list[0]) + other_mean = mean(group_scores_list[1]) + return np.nan if baseline_mean == 0 else 1 - other_mean / baseline_mean @@ -2222,7 +2245,7 @@ def interpret_cohens_effect_size(x: float): def normalized_cohens_h( subgroup_scores_dict: Dict[str, List], - expected_subgroup_types: List[str], + expected_subgroup_types: List[List[str]], interpret=False, ): """Cohen's h effect size between two proportions, normalized to interval [-1,1]. @@ -2249,8 +2272,9 @@ def normalized_cohens_h( Args: subgroup_scores_dict: dict where keys are from the set ('original', 'paraphrase') and values are lists of instance scores corresponding to subgroup_types with that key - expected_subgroup_types: list of the expected labels in subgroup_scores_dict (the first should be the - baseline type, the second the other). + expected_subgroup_types: 2-element list, each element is a list of strings (typically a single element), + Each list is the label subset whose average scores are to be compared. + The first group should be the baseline, the second the comparison group. 
interpret: boolean, whether to interpret the significance of the score or not Returns: float score between -1 and 1, and a string interpretation if interpret=True @@ -2259,19 +2283,25 @@ def normalized_cohens_h( subgroup_scores_dict = validate_subgroup_types( subgroup_scores_dict, expected_subgroup_types ) - if any(len(subgroup_scores_dict[kk]) == 0 for kk in expected_subgroup_types): - # no comparison can be made since there is not at least one score per type - return np.nan + # combine all scores from each sub-label (if there are more than 1 in each group) into a list + group_scores_list = [ + np.concatenate([subgroup_scores_dict[vvv] for vvv in vv]) + for vv in expected_subgroup_types + ] # requires scores to be in [0,1] - for kk, score_list in subgroup_scores_dict.items(): - assert all( - 0 <= score <= 1 for score in score_list - ), f"all {kk} scores must be in [0,1]" + for scores in group_scores_list: + assert all(0 <= score <= 1 for score in scores), "all scores must be in [0,1]" + + if any(len(scores) == 0 for scores in group_scores_list): + # no comparison can be made since there is not at least one score per type + h = np.nan + norm_h = np.nan + else: + baseline_mean = mean(group_scores_list[0]) + other_mean = mean(group_scores_list[1]) + h = 2 * (np.arcsin(np.sqrt(other_mean)) - np.arcsin(np.sqrt(baseline_mean))) + norm_h = np.clip(a=h / np.pi, a_min=-1, a_max=1) - baseline_mean = mean(subgroup_scores_dict[expected_subgroup_types[0]]) - other_mean = mean(subgroup_scores_dict[expected_subgroup_types[1]]) - h = 2 * (np.arcsin(np.sqrt(other_mean)) - np.arcsin(np.sqrt(baseline_mean))) - norm_h = np.clip(a=h / np.pi, a_min=-1, a_max=1) if not interpret: return norm_h @@ -2280,7 +2310,7 @@ def normalized_cohens_h( def cohens_d( subgroup_scores_dict: Dict[str, List], - expected_subgroup_types: List[str], + expected_subgroup_types: List[List[str]], interpret=False, ): """Cohen's d effect size between mean of two samples. @@ -2290,8 +2320,9 @@ def cohens_d( Args: subgroup_scores_dict: dict where keys are from the set ('original', 'paraphrase') and values are lists of instance scores corresponding to subgroup_types with that key - expected_subgroup_types: list of the expected labels in subgroup_scores_dict (the first should be the - baseline type, the second the other). + expected_subgroup_types: 2-element list, each element is a list of strings (typically a single element), + Each list is the label subset whose average scores are to be compared. + The first group should be the baseline, the second the comparison group. interpret: boolean, whether to interpret the significance of the score or not Returns: float score, and a string interpretation if interpret=True @@ -2300,25 +2331,31 @@ def cohens_d( subgroup_scores_dict = validate_subgroup_types( subgroup_scores_dict, expected_subgroup_types ) - group_n = [len(subgroup_scores_dict[st]) for st in expected_subgroup_types] + # combine all scores from each sub-label (if there are more than 1 in each group) into a list + group_scores_list = [ + np.concatenate([subgroup_scores_dict[vvv] for vvv in vv]) + for vv in expected_subgroup_types + ] + + group_n = [len(scores) for scores in group_scores_list] if any(nn == 0 for nn in group_n) or all(nn <= 1 for nn in group_n): # if at least one sample size is 0 for one type, no comparison can be made at all # if both sample sizes are 1, then the denominator is undefined since divide by n1 + n2 - 2 # so require at least one sample to have > 1 observation, and both to have >= 1. 
- return np.nan - - # otherwise, calculate the variances - group_mean = [mean(subgroup_scores_dict[st]) for st in expected_subgroup_types] - # sample variance with 1 degree of freedom (denominator n-1); if n=1, return 0 since otherwise throws an error - group_var = [ - 0.0 if nn == 1 else np.var(subgroup_scores_dict[st], ddof=1) - for st, nn in zip(expected_subgroup_types, group_n) - ] - var_total = sum([(nn - 1) * vv for vv, nn in zip(group_var, group_n)]) - pooled_sd = np.sqrt(var_total / (sum(group_n) - 2)) - d = np.diff(group_mean)[0] / pooled_sd - # clip it at a very large value so it doesn't become infinite if the variance (denominator) is 0 - d = np.clip(a=d, a_min=-1000, a_max=1000) + d = np.nan + else: + # otherwise, calculate the variances + group_mean = [mean(scores) for scores in group_scores_list] + # sample variance with 1 degree of freedom (denominator n-1); if n=1, return 0 since otherwise throws an error + group_var = [ + 0.0 if nn == 1 else np.var(scores, ddof=1) + for scores, nn in zip(group_scores_list, group_n) + ] + var_total = sum([(nn - 1) * vv for vv, nn in zip(group_var, group_n)]) + pooled_sd = np.sqrt(var_total / (sum(group_n) - 2)) + d = np.diff(group_mean)[0] / pooled_sd + # clip it at a very large value so it doesn't become infinite if the variance (denominator) is 0 + d = float(np.clip(a=d, a_min=-5, a_max=5)) if not interpret: return d @@ -2337,16 +2374,14 @@ def mean_subgroup_score( Returns: float score """ + expected_subgroup_types = list(set(expected_subgroup_types)) subgroup_scores_dict = validate_subgroup_types( subgroup_scores_dict, expected_subgroup_types ) - from itertools import chain # combine all desired subgroup scores - score_list = list( - chain.from_iterable( - [subgroup_scores_dict[st] for st in expected_subgroup_types] - ) + score_list = np.concatenate( + [subgroup_scores_dict[st] for st in expected_subgroup_types] ) if len(score_list) == 0: # no scores to use @@ -2441,15 +2476,15 @@ class FixedGroupMeanParaphraseStringContainment(StringContainment): # using PDR -class FixedGroupPDRAccuracy(Accuracy): +class FixedGroupPDRParaphraseAccuracy(Accuracy): subgroup_column = "variant_type" reduction_map = { "group_mean": { "agg_func": [ - "pdr", + "pdr_paraphrase", lambda scd: performance_drop_rate( subgroup_scores_dict=scd, - expected_subgroup_types=["original", "paraphrase"], + expected_subgroup_types=[["original"], ["paraphrase"]], ), True, ], @@ -2457,15 +2492,15 @@ class FixedGroupPDRAccuracy(Accuracy): } -class FixedGroupPDRStringContainment(StringContainment): +class FixedGroupPDRParaphraseStringContainment(StringContainment): subgroup_column = "variant_type" reduction_map = { "group_mean": { "agg_func": [ - "pdr", + "pdr_paraphrase", lambda scd: performance_drop_rate( subgroup_scores_dict=scd, - expected_subgroup_types=["original", "paraphrase"], + expected_subgroup_types=[["original"], ["paraphrase"]], ), True, ], @@ -2483,15 +2518,15 @@ class GroupMeanTokenOverlap(TokenOverlap): # using Cohens's h for proportions -class FixedGroupNormCohensHAccuracy(Accuracy): +class FixedGroupNormCohensHParaphraseAccuracy(Accuracy): subgroup_column = "variant_type" reduction_map = { "group_mean": { "agg_func": [ - "norm_cohens_h", + "norm_cohens_h_paraphrase", lambda scd: normalized_cohens_h( subgroup_scores_dict=scd, - expected_subgroup_types=["original", "paraphrase"], + expected_subgroup_types=[["original"], ["paraphrase"]], ), True, ], @@ -2499,15 +2534,15 @@ class FixedGroupNormCohensHAccuracy(Accuracy): } -class 
FixedGroupNormCohensHStringContainment(StringContainment): +class FixedGroupNormCohensHParaphraseStringContainment(StringContainment): subgroup_column = "variant_type" reduction_map = { "group_mean": { "agg_func": [ - "norm_cohens_h", + "norm_cohens_h_paraphrase", lambda scd: normalized_cohens_h( subgroup_scores_dict=scd, - expected_subgroup_types=["original", "paraphrase"], + expected_subgroup_types=[["original"], ["paraphrase"]], ), True, ], @@ -2516,15 +2551,15 @@ class FixedGroupNormCohensHStringContainment(StringContainment): # using Cohen's d (takes into account internal variation in group scores) -class FixedGroupCohensDAccuracy(Accuracy): +class FixedGroupCohensDParaphraseAccuracy(Accuracy): subgroup_column = "variant_type" reduction_map = { "group_mean": { "agg_func": [ - "cohens_d", + "cohens_d_paraphrase", lambda scd: cohens_d( subgroup_scores_dict=scd, - expected_subgroup_types=["original", "paraphrase"], + expected_subgroup_types=[["original"], ["paraphrase"]], ), True, ], @@ -2532,15 +2567,15 @@ class FixedGroupCohensDAccuracy(Accuracy): } -class FixedGroupCohensDStringContainment(StringContainment): +class FixedGroupCohensDParaphraseStringContainment(StringContainment): subgroup_column = "variant_type" reduction_map = { "group_mean": { "agg_func": [ - "cohens_d", + "cohens_d_paraphrase", lambda scd: cohens_d( subgroup_scores_dict=scd, - expected_subgroup_types=["original", "paraphrase"], + expected_subgroup_types=[["original"], ["paraphrase"]], ), True, ], diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 71a227a29d..6dba72c74b 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -11,18 +11,18 @@ F1Micro, F1MicroMultiLabel, F1Weighted, - FixedGroupCohensDAccuracy, - FixedGroupCohensDStringContainment, + FixedGroupCohensDParaphraseAccuracy, + FixedGroupCohensDParaphraseStringContainment, FixedGroupMeanAccuracy, FixedGroupMeanBaselineAccuracy, FixedGroupMeanBaselineStringContainment, FixedGroupMeanParaphraseAccuracy, FixedGroupMeanParaphraseStringContainment, FixedGroupMeanStringContainment, - FixedGroupNormCohensHAccuracy, - FixedGroupNormCohensHStringContainment, - FixedGroupPDRAccuracy, - FixedGroupPDRStringContainment, + FixedGroupNormCohensHParaphraseAccuracy, + FixedGroupNormCohensHParaphraseStringContainment, + FixedGroupPDRParaphraseAccuracy, + FixedGroupPDRParaphraseStringContainment, GroupMeanAccuracy, GroupMeanStringContainment, GroupMeanTokenOverlap, @@ -482,12 +482,12 @@ def test_grouped_instance_metrics(self): FixedGroupMeanBaselineStringContainment(), FixedGroupMeanParaphraseStringContainment(), GroupMeanTokenOverlap(), - FixedGroupNormCohensHAccuracy(), - FixedGroupNormCohensHStringContainment(), - FixedGroupPDRAccuracy(), - FixedGroupPDRStringContainment(), - FixedGroupCohensDAccuracy(), - FixedGroupCohensDStringContainment(), + FixedGroupNormCohensHParaphraseAccuracy(), + FixedGroupNormCohensHParaphraseStringContainment(), + FixedGroupPDRParaphraseAccuracy(), + FixedGroupPDRParaphraseStringContainment(), + FixedGroupCohensDParaphraseAccuracy(), + FixedGroupCohensDParaphraseStringContainment(), ] global_targets = [ 0.225, @@ -503,7 +503,7 @@ def test_grouped_instance_metrics(self): -0.4639421840102023, 0.8333333333333334, 0.4444444444444445, - -333.55156684612643, + -1.8849001794597504, -0.7698003589195009, ] for metric, target in zip(accuracy_metrics, global_targets): @@ -722,7 +722,7 @@ def test_grouped_instance_metric_confidence_interval(self): ) self._test_grouped_instance_confidence_interval( - 
metric=FixedGroupNormCohensHAccuracy(), + metric=FixedGroupNormCohensHParaphraseAccuracy(), expected_ci_low=-1.0, expected_ci_high=0.33333333333333337, ) @@ -730,31 +730,31 @@ def test_grouped_instance_metric_confidence_interval(self): # note, this metric has an issue where the ci_high on PCs on Travis slightly diverges from the local results # hence this test may fail on a PC self._test_grouped_instance_confidence_interval( - metric=FixedGroupNormCohensHStringContainment(), + metric=FixedGroupNormCohensHParaphraseStringContainment(), expected_ci_low=-0.49999999999999994, expected_ci_high=-0.39182655203060723, ) self._test_grouped_instance_confidence_interval( - metric=FixedGroupPDRAccuracy(), + metric=FixedGroupPDRParaphraseAccuracy(), expected_ci_low=0.6666666666666666, expected_ci_high=1.0, ) self._test_grouped_instance_confidence_interval( - metric=FixedGroupPDRStringContainment(), + metric=FixedGroupPDRParaphraseStringContainment(), expected_ci_low=0.3333333333333333, expected_ci_high=0.5, ) self._test_grouped_instance_confidence_interval( - metric=FixedGroupCohensDAccuracy(), - expected_ci_low=-1000.0, + metric=FixedGroupCohensDParaphraseAccuracy(), + expected_ci_low=-5.0, expected_ci_high=0.5, ) self._test_grouped_instance_confidence_interval( - metric=FixedGroupCohensDStringContainment(), + metric=FixedGroupCohensDParaphraseStringContainment(), expected_ci_low=-0.8660254037844387, expected_ci_high=-0.5773502691896257, ) From 9ad6aa7b1148393a99850170dd089894728397c6 Mon Sep 17 00:00:00 2001 From: sam-data-guy-iam <124508972+sam-data-guy-iam@users.noreply.github.com> Date: Mon, 12 Feb 2024 12:38:17 +0200 Subject: [PATCH 57/83] Delete src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_accuracy.json rename metric --- .../metrics/robustness/fixed_group_cohens_d_accuracy.json | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_accuracy.json diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_accuracy.json b/src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_accuracy.json deleted file mode 100644 index e60abead20..0000000000 --- a/src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_accuracy.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "type": "fixed_group_cohens_d_accuracy" -} From a74ab93808d522c5ec0cf174d96c7e6c0d406444 Mon Sep 17 00:00:00 2001 From: sam-data-guy-iam <124508972+sam-data-guy-iam@users.noreply.github.com> Date: Mon, 12 Feb 2024 12:38:29 +0200 Subject: [PATCH 58/83] Delete src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_string_containment.json rename metric --- .../robustness/fixed_group_cohens_d_string_containment.json | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_string_containment.json diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_string_containment.json b/src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_string_containment.json deleted file mode 100644 index e68f00a376..0000000000 --- a/src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_string_containment.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "type": "fixed_group_cohens_d_string_containment" -} From b0c3f5eac2626453a57ca9da72e1ff7d511a475a Mon Sep 17 00:00:00 2001 From: sam-data-guy-iam <124508972+sam-data-guy-iam@users.noreply.github.com> Date: Mon, 12 Feb 2024 12:38:50 +0200 Subject: [PATCH 59/83] Delete src/unitxt/catalog/metrics/robustness/fixed_group_norm_cohens_h_accuracy.json rename 
metric --- .../metrics/robustness/fixed_group_norm_cohens_h_accuracy.json | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_norm_cohens_h_accuracy.json diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_norm_cohens_h_accuracy.json b/src/unitxt/catalog/metrics/robustness/fixed_group_norm_cohens_h_accuracy.json deleted file mode 100644 index dc5f597162..0000000000 --- a/src/unitxt/catalog/metrics/robustness/fixed_group_norm_cohens_h_accuracy.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "type": "fixed_group_norm_cohens_h_accuracy" -} From e7b0e00be99df20edc20af746430fdccbbb79a75 Mon Sep 17 00:00:00 2001 From: sam-data-guy-iam <124508972+sam-data-guy-iam@users.noreply.github.com> Date: Mon, 12 Feb 2024 12:39:06 +0200 Subject: [PATCH 60/83] Delete src/unitxt/catalog/metrics/robustness/fixed_group_norm_cohens_h_string_containment.json rename metric --- .../fixed_group_norm_cohens_h_string_containment.json | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_norm_cohens_h_string_containment.json diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_norm_cohens_h_string_containment.json b/src/unitxt/catalog/metrics/robustness/fixed_group_norm_cohens_h_string_containment.json deleted file mode 100644 index e72af8061e..0000000000 --- a/src/unitxt/catalog/metrics/robustness/fixed_group_norm_cohens_h_string_containment.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "type": "fixed_group_norm_cohens_h_string_containment" -} From f061b1f1eee45ad0d38dc2ad34c718c0f5627f50 Mon Sep 17 00:00:00 2001 From: sam-data-guy-iam <124508972+sam-data-guy-iam@users.noreply.github.com> Date: Mon, 12 Feb 2024 12:39:21 +0200 Subject: [PATCH 61/83] Delete src/unitxt/catalog/metrics/robustness/fixed_group_pdr_accuracy.json rename metric --- .../catalog/metrics/robustness/fixed_group_pdr_accuracy.json | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_pdr_accuracy.json diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_pdr_accuracy.json b/src/unitxt/catalog/metrics/robustness/fixed_group_pdr_accuracy.json deleted file mode 100644 index 56b62a01f9..0000000000 --- a/src/unitxt/catalog/metrics/robustness/fixed_group_pdr_accuracy.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "type": "fixed_group_pdr_accuracy" -} From ca38302ad56b37b475642189766a3081b46e4f01 Mon Sep 17 00:00:00 2001 From: sam-data-guy-iam <124508972+sam-data-guy-iam@users.noreply.github.com> Date: Mon, 12 Feb 2024 12:39:34 +0200 Subject: [PATCH 62/83] Delete src/unitxt/catalog/metrics/robustness/fixed_group_pdr_string_containment.json rename metric --- .../metrics/robustness/fixed_group_pdr_string_containment.json | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_pdr_string_containment.json diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_pdr_string_containment.json b/src/unitxt/catalog/metrics/robustness/fixed_group_pdr_string_containment.json deleted file mode 100644 index 9b13641d2a..0000000000 --- a/src/unitxt/catalog/metrics/robustness/fixed_group_pdr_string_containment.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "type": "fixed_group_pdr_string_containment" -} From 8634264e4053a166961e39cd5818350f478bba6a Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Mon, 12 Feb 2024 12:40:07 +0200 Subject: [PATCH 63/83] rename to include string 'paraphrase' to distinguish from 'all variants' --- 
.../robustness/fixed_group_cohens_d_paraphrase_accuracy.json | 3 +++ .../fixed_group_cohens_d_paraphrase_string_containment.json | 3 +++ .../fixed_group_norm_cohens_h_paraphrase_accuracy.json | 3 +++ ...ixed_group_norm_cohens_h_paraphrase_string_containment.json | 3 +++ .../robustness/fixed_group_pdr_paraphrase_accuracy.json | 3 +++ .../fixed_group_pdr_paraphrase_string_containment.json | 3 +++ 6 files changed, 18 insertions(+) create mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_paraphrase_accuracy.json create mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_paraphrase_string_containment.json create mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_norm_cohens_h_paraphrase_accuracy.json create mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_norm_cohens_h_paraphrase_string_containment.json create mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_pdr_paraphrase_accuracy.json create mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_pdr_paraphrase_string_containment.json diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_paraphrase_accuracy.json b/src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_paraphrase_accuracy.json new file mode 100644 index 0000000000..18a7e235fd --- /dev/null +++ b/src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_paraphrase_accuracy.json @@ -0,0 +1,3 @@ +{ + "type": "fixed_group_cohens_d_paraphrase_accuracy" +} diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_paraphrase_string_containment.json b/src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_paraphrase_string_containment.json new file mode 100644 index 0000000000..fef67b99e9 --- /dev/null +++ b/src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_paraphrase_string_containment.json @@ -0,0 +1,3 @@ +{ + "type": "fixed_group_cohens_d_paraphrase_string_containment" +} diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_norm_cohens_h_paraphrase_accuracy.json b/src/unitxt/catalog/metrics/robustness/fixed_group_norm_cohens_h_paraphrase_accuracy.json new file mode 100644 index 0000000000..0b07d30f09 --- /dev/null +++ b/src/unitxt/catalog/metrics/robustness/fixed_group_norm_cohens_h_paraphrase_accuracy.json @@ -0,0 +1,3 @@ +{ + "type": "fixed_group_norm_cohens_h_paraphrase_accuracy" +} diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_norm_cohens_h_paraphrase_string_containment.json b/src/unitxt/catalog/metrics/robustness/fixed_group_norm_cohens_h_paraphrase_string_containment.json new file mode 100644 index 0000000000..b27f65e1d3 --- /dev/null +++ b/src/unitxt/catalog/metrics/robustness/fixed_group_norm_cohens_h_paraphrase_string_containment.json @@ -0,0 +1,3 @@ +{ + "type": "fixed_group_norm_cohens_h_paraphrase_string_containment" +} diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_pdr_paraphrase_accuracy.json b/src/unitxt/catalog/metrics/robustness/fixed_group_pdr_paraphrase_accuracy.json new file mode 100644 index 0000000000..ca0d01c6b5 --- /dev/null +++ b/src/unitxt/catalog/metrics/robustness/fixed_group_pdr_paraphrase_accuracy.json @@ -0,0 +1,3 @@ +{ + "type": "fixed_group_pdr_paraphrase_accuracy" +} diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_pdr_paraphrase_string_containment.json b/src/unitxt/catalog/metrics/robustness/fixed_group_pdr_paraphrase_string_containment.json new file mode 100644 index 0000000000..461f3ab221 --- /dev/null +++ 
b/src/unitxt/catalog/metrics/robustness/fixed_group_pdr_paraphrase_string_containment.json @@ -0,0 +1,3 @@ +{ + "type": "fixed_group_pdr_paraphrase_string_containment" +} From 7b16dd5510b2b63a20dd3ad2a888a08d5bd8f32f Mon Sep 17 00:00:00 2001 From: sam-data-guy-iam <124508972+sam-data-guy-iam@users.noreply.github.com> Date: Tue, 13 Feb 2024 22:32:51 +0200 Subject: [PATCH 64/83] Delete src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_paraphrase_accuracy.json rename to Hedges' g --- .../robustness/fixed_group_cohens_d_paraphrase_accuracy.json | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_paraphrase_accuracy.json diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_paraphrase_accuracy.json b/src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_paraphrase_accuracy.json deleted file mode 100644 index 18a7e235fd..0000000000 --- a/src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_paraphrase_accuracy.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "type": "fixed_group_cohens_d_paraphrase_accuracy" -} From e5c71cca380a3c292d9249c2e5c81ecc64e8c409 Mon Sep 17 00:00:00 2001 From: sam-data-guy-iam <124508972+sam-data-guy-iam@users.noreply.github.com> Date: Tue, 13 Feb 2024 22:33:14 +0200 Subject: [PATCH 65/83] Delete src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_paraphrase_string_containment.json rename to Hedges' g --- .../fixed_group_cohens_d_paraphrase_string_containment.json | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_paraphrase_string_containment.json diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_paraphrase_string_containment.json b/src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_paraphrase_string_containment.json deleted file mode 100644 index fef67b99e9..0000000000 --- a/src/unitxt/catalog/metrics/robustness/fixed_group_cohens_d_paraphrase_string_containment.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "type": "fixed_group_cohens_d_paraphrase_string_containment" -} From 273c389ba4117d52e4d5caa3dfcb0afd81b43aef Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Tue, 13 Feb 2024 22:36:26 +0200 Subject: [PATCH 66/83] redefine Cohen's d as Hedge's g, with correction. 
for grouped comparison aggregations, use two list arguments rather than a single list argument with two sub-lists --- prepare/metrics/grouped_instance_metrics.py | 38 +-- src/unitxt/metrics.py | 254 +++++++++++--------- tests/test_metrics.py | 22 +- 3 files changed, 169 insertions(+), 145 deletions(-) diff --git a/prepare/metrics/grouped_instance_metrics.py b/prepare/metrics/grouped_instance_metrics.py index 430c16f6db..e59313d227 100644 --- a/prepare/metrics/grouped_instance_metrics.py +++ b/prepare/metrics/grouped_instance_metrics.py @@ -4,8 +4,8 @@ from src.unitxt import add_to_catalog from src.unitxt.metrics import ( - FixedGroupCohensDParaphraseAccuracy, - FixedGroupCohensDParaphraseStringContainment, + FixedGroupHedgesGParaphraseAccuracy, + FixedGroupHedgesGParaphraseStringContainment, FixedGroupMeanAccuracy, FixedGroupMeanBaselineAccuracy, FixedGroupMeanBaselineStringContainment, @@ -447,15 +447,15 @@ # Cohen's D will always use fixed groups -metric = FixedGroupCohensDParaphraseAccuracy() +metric = FixedGroupHedgesGParaphraseAccuracy() global_target = { - "fixed_group_cohens_d_paraphrase_accuracy": -1.88, - "score": -1.88, - "score_name": "fixed_group_cohens_d_paraphrase_accuracy", + "fixed_group_hedges_g_paraphrase_accuracy": -1.73, + "score": -1.73, + "score_name": "fixed_group_hedges_g_paraphrase_accuracy", "score_ci_low": -5.0, - "score_ci_high": 0.5, - "fixed_group_cohens_d_paraphrase_accuracy_ci_low": -5.0, - "fixed_group_cohens_d_paraphrase_accuracy_ci_high": 0.5, + "score_ci_high": 0.28, + "fixed_group_hedges_g_paraphrase_accuracy_ci_low": -5.0, + "fixed_group_hedges_g_paraphrase_accuracy_ci_high": 0.28, } @@ -470,20 +470,20 @@ add_to_catalog( metric, - "metrics.robustness.fixed_group_cohens_d_paraphrase_accuracy", + "metrics.robustness.fixed_group_hedges_g_paraphrase_accuracy", overwrite=True, ) -metric = FixedGroupCohensDParaphraseStringContainment() +metric = FixedGroupHedgesGParaphraseStringContainment() global_target = { - "fixed_group_cohens_d_paraphrase_string_containment": -0.77, - "score": -0.77, - "score_name": "fixed_group_cohens_d_paraphrase_string_containment", - "score_ci_low": -0.87, - "score_ci_high": -0.58, - "fixed_group_cohens_d_paraphrase_string_containment_ci_low": -0.87, - "fixed_group_cohens_d_paraphrase_string_containment_ci_high": -0.58, + "fixed_group_hedges_g_paraphrase_string_containment": -0.4, + "score": -0.4, + "score_name": "fixed_group_hedges_g_paraphrase_string_containment", + "score_ci_low": -0.49, + "score_ci_high": -0.23, + "fixed_group_hedges_g_paraphrase_string_containment_ci_low": -0.49, + "fixed_group_hedges_g_paraphrase_string_containment_ci_high": -0.23, } @@ -498,7 +498,7 @@ add_to_catalog( metric, - "metrics.robustness.fixed_group_cohens_d_paraphrase_string_containment", + "metrics.robustness.fixed_group_hedges_g_paraphrase_string_containment", overwrite=True, ) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 23127e2404..eb0b8a324a 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -149,7 +149,7 @@ def score_based_confidence_interval( # that is, re-form the groups, calculate the function, and take the mean of the group scores aggregation_func = self.average_item_scores for score_name in score_names: - # need to redefine the statistic function within the loop because score_name is a loop variable, to avoid ruff errors + # need to redefine the statistic function within the loop because score_name is a loop variable def statistic(arr, axis, score_name=score_name): # arr is a 2d array where each row is 
a resampling, so we # iterate over the rows and compute the metric on each resampling @@ -444,7 +444,7 @@ class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval): """Class for metrics for which a global score can be calculated by aggregating the instance scores (possibly with additional instance inputs). InstanceMetric currently allows two reductions: - 1. 'mean', which calculates the mean of instance scores,' + 1. 'mean', which calculates the mean of instance scores, 2. 'group_mean', which first applies an aggregation function specified in the reduction_map to instance scores grouped by the field grouping_field (which must not be None), and returns the mean of the group scores; if grouping_field is None, grouping is disabled. @@ -585,7 +585,8 @@ def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generato ) aggregation_function_name = str(reduction_params["agg_func"][0]) field_name_full_prefix = "group_" + aggregation_function_name + "_" - if reduction_params["agg_func"][2]: + do_resample_as_group = reduction_params["agg_func"][2] + if do_resample_as_group: # append fixed_ to name because resamples the groups as fixed units field_name_full_prefix = "fixed_" + field_name_full_prefix ( @@ -736,7 +737,8 @@ def _set_up_group_mean_aggregation( ): group_aggregation_func = reduction_params["agg_func"][1] # if treat groups as units - if reduction_params["agg_func"][2]: + do_resample_as_group = reduction_params["agg_func"][2] + if do_resample_as_group: # pass the group aggregate---not instance---scores to resample as usual aggregation_function = self.average_item_scores scores_to_resample = self.get_group_scores( @@ -2124,91 +2126,88 @@ def should_ignore_element(self, element, additional_input): # define metrics that return means of an aggregation function applied across levels of a grouping variable def validate_subgroup_types( - subgroup_scores_dict: Dict[str, List], expected_subgroup_types: List + subgroup_scores_dict: Dict[str, List], + control_subgroup_types: List[str], + comparison_subgroup_types: List[str], ): - """Validate a dict of subgroup type instance score lists. + """Validate a dict of subgroup type instance score lists, and subgroup type lists. Args: subgroup_scores_dict: dict where keys are subgroup types and values are lists of instance scores. - expected_subgroup_types: list of the subgroup types which should exist in subgroup_scores_dict, so - that another function that receives it as input will valid. + control_subgroup_types: list of subgroup types (potential keys of subgroup_scores_dict) that are the control (baseline) group + comparison_subgroup_types: list of subgroup types (potential keys of subgroup_scores_dict) that are the group + to be compared to the control group. 
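# A rough usage sketch of the control/comparison convention described above,
# assuming instance scores arrive keyed by a variant-type label; the dict
# contents and label names here are hypothetical, not taken from the library.
import numpy as np

subgroup_scores = {
    "original": [1.0, 0.0, 1.0],
    "paraphrase": [0.0, 1.0, 0.0],
    "typo": [0.0, 0.0, 1.0],
}

# each side of a comparison is a *list* of labels, so several perturbation
# types can be pooled into a single comparison group
control = np.concatenate([subgroup_scores[k] for k in ["original"]])
comparison = np.concatenate([subgroup_scores[k] for k in ["paraphrase", "typo"]])

# normalized Cohen's h between the two pooled proportions, mirroring the
# normalized_cohens_h hunk later in this diff
h = 2 * (np.arcsin(np.sqrt(comparison.mean())) - np.arcsin(np.sqrt(control.mean())))
print(np.clip(h / np.pi, -1, 1))  # ~ -0.22: performance drops on the perturbed variants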
Returns: - dict with all NaN scores removed; any expected keys that are missing have an empty list inserted + dict with all NaN scores removed; control_subgroup_types and comparison_subgroup_types will have non-unique elements removed """ + # note: subgroup_scores_dict is already a defaultdict of lists, so don't need to check that keys in control_ and comparison_subgroup_types exist in it # remove any NaNs subgroup_scores_dict.update( { - kk: [vvv for vvv in vv if not np.isnan(vvv)] - for kk, vv in subgroup_scores_dict.items() + subgroup_name: [score for score in score_list if not np.isnan(score)] + for subgroup_name, score_list in subgroup_scores_dict.items() } ) - if expected_subgroup_types is None: - expected_subgroup_types = [] - else: - # expected_subgroup_types could be a list of lists; now take unique values - from itertools import chain - - expected_subgroup_types = list( - set(chain.from_iterable(expected_subgroup_types)) - ) + assert isinstance( + control_subgroup_types, list + ), "control_subgroup_types must be a list" + assert isinstance( + comparison_subgroup_types, list + ), "comparison_subgroup_types must be a list" + # make sure each list is unique, so that labels aren't double-counted + control_subgroup_types = list(set(control_subgroup_types)) + comparison_subgroup_types = list(set(comparison_subgroup_types)) - # make sure the expected types appear - subgroup_scores_dict.update( - {kk: [] for kk in expected_subgroup_types if kk not in subgroup_scores_dict} - ) - return subgroup_scores_dict + return subgroup_scores_dict, control_subgroup_types, comparison_subgroup_types def performance_drop_rate( - subgroup_scores_dict: Dict[str, List], expected_subgroup_types: List[List[str]] + subgroup_scores_dict: Dict[str, List], + control_subgroup_types: List[str], + comparison_subgroup_types: List[str], ): - """Percentage decrease of mean performance on test elements relative to that on a baseline. + """Percentage decrease of mean performance on test elements relative to that on a baseline (control). from https://arxiv.org/pdf/2306.04528.pdf. Args: - subgroup_scores_dict: dict where keys are from the set ('original', 'paraphrase') and values are lists - of instance scores corresponding to subgroup_types with that key - expected_subgroup_types: 2-element list, each element is a list of strings (typically a single element), - Each list is the label subset whose average scores are to be compared. - The first group should be the baseline, the second the comparison group. + subgroup_scores_dict: dict where keys are subgroup types and values are lists of instance scores. + control_subgroup_types: list of subgroup types (potential keys of subgroup_scores_dict) that are the control (baseline) group + comparison_subgroup_types: list of subgroup types (potential keys of subgroup_scores_dict) that are the group + to be compared to the control group. Returns: numeric PDR metric. 
If only one element (no test set) or the first is 0 (percentage change is undefined) return NaN otherwise, calculate PDR - """ - assert ( - len(expected_subgroup_types) == 2 - ), "expected_subgroup_types must have two elements" - assert all( - isinstance(vv, list) for vv in expected_subgroup_types - ), "each element of expected_subgroup_types must be a list" - # make sure each list is unique - expected_subgroup_types = [list(set(vv)) for vv in expected_subgroup_types] - - subgroup_scores_dict = validate_subgroup_types( - subgroup_scores_dict, expected_subgroup_types + ( + subgroup_scores_dict, + control_subgroup_types, + comparison_subgroup_types, + ) = validate_subgroup_types( + subgroup_scores_dict, control_subgroup_types, comparison_subgroup_types ) - # combine all scores from each sub-label (if there are more than 1 in each group) into a list + + # combine all scores from each label (if there are more than 1 in each group) into a list group_scores_list = [ - np.concatenate([subgroup_scores_dict[vvv] for vvv in vv]) - for vv in expected_subgroup_types + np.concatenate( + [subgroup_scores_dict[subgroup_name] for subgroup_name in name_list] + ) + for name_list in [control_subgroup_types, comparison_subgroup_types] ] if any(len(scores) == 0 for scores in group_scores_list): # no comparison can be made since there is not at least one score per type return np.nan - # first group are baseline scores, second is others - baseline_mean = mean(group_scores_list[0]) - other_mean = mean(group_scores_list[1]) + control_mean = mean(group_scores_list[0]) + comparison_mean = mean(group_scores_list[1]) - return np.nan if baseline_mean == 0 else 1 - other_mean / baseline_mean + return np.nan if control_mean == 0 else 1 - comparison_mean / control_mean -def interpret_cohens_effect_size(x: float): - """Return a string interpretation of a Cohen effect size value. +def interpret_effect_size(x: float): + """Return a string rule-of-thumb interpretation of an effect size value, as defined by Cohen/Sawilowsky. See https://en.wikipedia.org/wiki/Effect_size; Cohen, Jacob (1988). Statistical Power Analysis for the Behavioral Sciences; and @@ -2250,7 +2249,8 @@ def interpret_cohens_effect_size(x: float): def normalized_cohens_h( subgroup_scores_dict: Dict[str, List], - expected_subgroup_types: List[List[str]], + control_subgroup_types: List[str], + comparison_subgroup_types: List[str], interpret=False, ): """Cohen's h effect size between two proportions, normalized to interval [-1,1]. @@ -2275,71 +2275,84 @@ def normalized_cohens_h( - a very large difference if 0.38197186 <= |norm h| < 0.63661977 - a huge difference if 0.63661977 <= |norm h| Args: - subgroup_scores_dict: dict where keys are from the set ('original', 'paraphrase') and values are lists - of instance scores corresponding to subgroup_types with that key - expected_subgroup_types: 2-element list, each element is a list of strings (typically a single element), - Each list is the label subset whose average scores are to be compared. - The first group should be the baseline, the second the comparison group. + subgroup_scores_dict: dict where keys are subgroup types and values are lists of instance scores. + control_subgroup_types: list of subgroup types (potential keys of subgroup_scores_dict) that are the control (baseline) group + comparison_subgroup_types: list of subgroup types (potential keys of subgroup_scores_dict) that are the group + to be compared to the control group. 
interpret: boolean, whether to interpret the significance of the score or not Returns: float score between -1 and 1, and a string interpretation if interpret=True """ - assert len(expected_subgroup_types) == 2 - subgroup_scores_dict = validate_subgroup_types( - subgroup_scores_dict, expected_subgroup_types + ( + subgroup_scores_dict, + control_subgroup_types, + comparison_subgroup_types, + ) = validate_subgroup_types( + subgroup_scores_dict, control_subgroup_types, comparison_subgroup_types ) - # combine all scores from each sub-label (if there are more than 1 in each group) into a list + + # requires scores to be in [0,1] + for subgroup_name, score_list in subgroup_scores_dict.items(): + assert all( + 0 <= score <= 1 for score in score_list + ), f"all {subgroup_name} scores must be in [0,1]" + + # combine all scores from each label (if there are more than 1 in each group) into a list group_scores_list = [ - np.concatenate([subgroup_scores_dict[vvv] for vvv in vv]) - for vv in expected_subgroup_types + np.concatenate( + [subgroup_scores_dict[subgroup_name] for subgroup_name in name_list] + ) + for name_list in [control_subgroup_types, comparison_subgroup_types] ] - # requires scores to be in [0,1] - for scores in group_scores_list: - assert all(0 <= score <= 1 for score in scores), "all scores must be in [0,1]" if any(len(scores) == 0 for scores in group_scores_list): # no comparison can be made since there is not at least one score per type - h = np.nan - norm_h = np.nan + h, norm_h = np.nan, np.nan else: - baseline_mean = mean(group_scores_list[0]) - other_mean = mean(group_scores_list[1]) - h = 2 * (np.arcsin(np.sqrt(other_mean)) - np.arcsin(np.sqrt(baseline_mean))) + control_mean = mean(group_scores_list[0]) + comparison_mean = mean(group_scores_list[1]) + h = 2 * (np.arcsin(np.sqrt(comparison_mean)) - np.arcsin(np.sqrt(control_mean))) norm_h = np.clip(a=h / np.pi, a_min=-1, a_max=1) if not interpret: return norm_h - return norm_h, interpret_cohens_effect_size(h) + return norm_h, interpret_effect_size(h) -def cohens_d( +def hedges_g( subgroup_scores_dict: Dict[str, List], - expected_subgroup_types: List[List[str]], + control_subgroup_types: List[str], + comparison_subgroup_types: List[str], interpret=False, ): - """Cohen's d effect size between mean of two samples. + """Hedge's g effect size between mean of two samples. Better than Cohen's d for small sample sizes. Takes into account the variances within the samples, not just the means. Args: - subgroup_scores_dict: dict where keys are from the set ('original', 'paraphrase') and values are lists - of instance scores corresponding to subgroup_types with that key - expected_subgroup_types: 2-element list, each element is a list of strings (typically a single element), - Each list is the label subset whose average scores are to be compared. - The first group should be the baseline, the second the comparison group. + subgroup_scores_dict: dict where keys are subgroup types and values are lists of instance scores. + control_subgroup_types: list of subgroup types (potential keys of subgroup_scores_dict) that are the control (baseline) group + comparison_subgroup_types: list of subgroup types (potential keys of subgroup_scores_dict) that are the group + to be compared to the control group. 
interpret: boolean, whether to interpret the significance of the score or not Returns: float score, and a string interpretation if interpret=True """ - assert len(expected_subgroup_types) == 2 - subgroup_scores_dict = validate_subgroup_types( - subgroup_scores_dict, expected_subgroup_types + ( + subgroup_scores_dict, + control_subgroup_types, + comparison_subgroup_types, + ) = validate_subgroup_types( + subgroup_scores_dict, control_subgroup_types, comparison_subgroup_types ) - # combine all scores from each sub-label (if there are more than 1 in each group) into a list + + # combine all scores from each label (if there are more than 1 in each group) into a list group_scores_list = [ - np.concatenate([subgroup_scores_dict[vvv] for vvv in vv]) - for vv in expected_subgroup_types + np.concatenate( + [subgroup_scores_dict[subgroup_name] for subgroup_name in name_list] + ) + for name_list in [control_subgroup_types, comparison_subgroup_types] ] group_n = [len(scores) for scores in group_scores_list] @@ -2347,7 +2360,7 @@ def cohens_d( # if at least one sample size is 0 for one type, no comparison can be made at all # if both sample sizes are 1, then the denominator is undefined since divide by n1 + n2 - 2 # so require at least one sample to have > 1 observation, and both to have >= 1. - d = np.nan + g = np.nan else: # otherwise, calculate the variances group_mean = [mean(scores) for scores in group_scores_list] @@ -2358,35 +2371,40 @@ def cohens_d( ] var_total = sum([(nn - 1) * vv for vv, nn in zip(group_var, group_n)]) pooled_sd = np.sqrt(var_total / (sum(group_n) - 2)) - d = np.diff(group_mean)[0] / pooled_sd + g = float(group_mean[1] - group_mean[0]) / pooled_sd + n = sum(group_n) + if 3 < n < 50: + # small sample adjustment see https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/hedgeg.htm + # the multiplier is 0 if n <= 3 + g *= ((n - 3) / (n - 2.25)) * np.sqrt((n - 2) / n) + # clip it at a very large value so it doesn't become infinite if the variance (denominator) is 0 - d = float(np.clip(a=d, a_min=-5, a_max=5)) + g = float(np.clip(a=g, a_min=-5, a_max=5)) if not interpret: - return d - return d, interpret_cohens_effect_size(d) + return g + return g, interpret_effect_size(g) def mean_subgroup_score( - subgroup_scores_dict: Dict[str, List], expected_subgroup_types: List[str] + subgroup_scores_dict: Dict[str, List], subgroup_types: List[str] ): """Return the mean instance score for a subset (possibly a single type) of variants (not a comparison). Args: subgroup_scores_dict: dict where keys are subgroup types and values are lists of instance scores. - expected_subgroup_types: the keys (subgroup types) for which the average will be computed + subgroup_types: the keys (subgroup types) for which the average will be computed. 
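# A worked sketch of the Hedges' g computation in the hunk above (pooled SD plus
# the small-sample correction); the function name and example score lists are
# hypothetical, and the zero-variance guard and +/-5 clipping of the library
# version are omitted for brevity.
import numpy as np

def hedges_g_sketch(control_scores, comparison_scores):
    n1, n2 = len(control_scores), len(comparison_scores)
    m1, m2 = np.mean(control_scores), np.mean(comparison_scores)
    # sample variances with ddof=1; the pooled SD uses n1 + n2 - 2 degrees of freedom
    v1 = 0.0 if n1 == 1 else np.var(control_scores, ddof=1)
    v2 = 0.0 if n2 == 1 else np.var(comparison_scores, ddof=1)
    pooled_sd = np.sqrt(((n1 - 1) * v1 + (n2 - 1) * v2) / (n1 + n2 - 2))
    g = (m2 - m1) / pooled_sd
    n = n1 + n2
    if 3 < n < 50:
        # Hedges' small-sample correction factor
        g *= ((n - 3) / (n - 2.25)) * np.sqrt((n - 2) / n)
    return float(g)

# e.g. hedges_g_sketch([0.9, 0.8, 1.0], [0.6, 0.5, 0.7]) ~ -1.96, a very large effect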
Returns: float score """ - expected_subgroup_types = list(set(expected_subgroup_types)) - subgroup_scores_dict = validate_subgroup_types( - subgroup_scores_dict, expected_subgroup_types + subgroup_scores_dict, subgroup_types, _ = validate_subgroup_types( + subgroup_scores_dict, subgroup_types, [] ) # combine all desired subgroup scores score_list = np.concatenate( - [subgroup_scores_dict[st] for st in expected_subgroup_types] + [subgroup_scores_dict[subgroup_name] for subgroup_name in subgroup_types] ) if len(score_list) == 0: # no scores to use @@ -2423,7 +2441,7 @@ class FixedGroupMeanBaselineAccuracy(Accuracy): "agg_func": [ "mean_baseline", lambda scd: mean_subgroup_score( - subgroup_scores_dict=scd, expected_subgroup_types=["original"] + subgroup_scores_dict=scd, subgroup_types=["original"] ), True, ], @@ -2439,7 +2457,7 @@ class FixedGroupMeanParaphraseAccuracy(Accuracy): "agg_func": [ "mean_paraphrase", lambda scd: mean_subgroup_score( - subgroup_scores_dict=scd, expected_subgroup_types=["paraphrase"] + subgroup_scores_dict=scd, subgroup_types=["paraphrase"] ), True, ], @@ -2456,7 +2474,7 @@ class FixedGroupMeanBaselineStringContainment(StringContainment): "agg_func": [ "mean_baseline", lambda scd: mean_subgroup_score( - subgroup_scores_dict=scd, expected_subgroup_types=["original"] + subgroup_scores_dict=scd, subgroup_types=["original"] ), True, ], @@ -2472,7 +2490,7 @@ class FixedGroupMeanParaphraseStringContainment(StringContainment): "agg_func": [ "mean_paraphrase", lambda scd: mean_subgroup_score( - subgroup_scores_dict=scd, expected_subgroup_types=["paraphrase"] + subgroup_scores_dict=scd, subgroup_types=["paraphrase"] ), True, ], @@ -2489,7 +2507,8 @@ class FixedGroupPDRParaphraseAccuracy(Accuracy): "pdr_paraphrase", lambda scd: performance_drop_rate( subgroup_scores_dict=scd, - expected_subgroup_types=[["original"], ["paraphrase"]], + control_subgroup_types=["original"], + comparison_subgroup_types=["paraphrase"], ), True, ], @@ -2505,7 +2524,8 @@ class FixedGroupPDRParaphraseStringContainment(StringContainment): "pdr_paraphrase", lambda scd: performance_drop_rate( subgroup_scores_dict=scd, - expected_subgroup_types=[["original"], ["paraphrase"]], + control_subgroup_types=["original"], + comparison_subgroup_types=["paraphrase"], ), True, ], @@ -2531,7 +2551,8 @@ class FixedGroupNormCohensHParaphraseAccuracy(Accuracy): "norm_cohens_h_paraphrase", lambda scd: normalized_cohens_h( subgroup_scores_dict=scd, - expected_subgroup_types=[["original"], ["paraphrase"]], + control_subgroup_types=["original"], + comparison_subgroup_types=["paraphrase"], ), True, ], @@ -2547,7 +2568,8 @@ class FixedGroupNormCohensHParaphraseStringContainment(StringContainment): "norm_cohens_h_paraphrase", lambda scd: normalized_cohens_h( subgroup_scores_dict=scd, - expected_subgroup_types=[["original"], ["paraphrase"]], + control_subgroup_types=["original"], + comparison_subgroup_types=["paraphrase"], ), True, ], @@ -2556,15 +2578,16 @@ class FixedGroupNormCohensHParaphraseStringContainment(StringContainment): # using Cohen's d (takes into account internal variation in group scores) -class FixedGroupCohensDParaphraseAccuracy(Accuracy): +class FixedGroupHedgesGParaphraseAccuracy(Accuracy): subgroup_column = "variant_type" reduction_map = { "group_mean": { "agg_func": [ - "cohens_d_paraphrase", - lambda scd: cohens_d( + "hedges_g_paraphrase", + lambda scd: hedges_g( subgroup_scores_dict=scd, - expected_subgroup_types=[["original"], ["paraphrase"]], + control_subgroup_types=["original"], + 
comparison_subgroup_types=["paraphrase"], ), True, ], @@ -2572,15 +2595,16 @@ class FixedGroupCohensDParaphraseAccuracy(Accuracy): } -class FixedGroupCohensDParaphraseStringContainment(StringContainment): +class FixedGroupHedgesGParaphraseStringContainment(StringContainment): subgroup_column = "variant_type" reduction_map = { "group_mean": { "agg_func": [ - "cohens_d_paraphrase", - lambda scd: cohens_d( + "hedges_g_paraphrase", + lambda scd: hedges_g( subgroup_scores_dict=scd, - expected_subgroup_types=[["original"], ["paraphrase"]], + control_subgroup_types=["original"], + comparison_subgroup_types=["paraphrase"], ), True, ], diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 6dba72c74b..757b133810 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -11,8 +11,8 @@ F1Micro, F1MicroMultiLabel, F1Weighted, - FixedGroupCohensDParaphraseAccuracy, - FixedGroupCohensDParaphraseStringContainment, + FixedGroupHedgesGParaphraseAccuracy, + FixedGroupHedgesGParaphraseStringContainment, FixedGroupMeanAccuracy, FixedGroupMeanBaselineAccuracy, FixedGroupMeanBaselineStringContainment, @@ -486,8 +486,8 @@ def test_grouped_instance_metrics(self): FixedGroupNormCohensHParaphraseStringContainment(), FixedGroupPDRParaphraseAccuracy(), FixedGroupPDRParaphraseStringContainment(), - FixedGroupCohensDParaphraseAccuracy(), - FixedGroupCohensDParaphraseStringContainment(), + FixedGroupHedgesGParaphraseAccuracy(), + FixedGroupHedgesGParaphraseStringContainment(), ] global_targets = [ 0.225, @@ -503,8 +503,8 @@ def test_grouped_instance_metrics(self): -0.4639421840102023, 0.8333333333333334, 0.4444444444444445, - -1.8849001794597504, - -0.7698003589195009, + -1.7282993195760106, + -0.4030078304086706, ] for metric, target in zip(accuracy_metrics, global_targets): outputs = apply_metric( @@ -748,15 +748,15 @@ def test_grouped_instance_metric_confidence_interval(self): ) self._test_grouped_instance_confidence_interval( - metric=FixedGroupCohensDParaphraseAccuracy(), + metric=FixedGroupHedgesGParaphraseAccuracy(), expected_ci_low=-5.0, - expected_ci_high=0.5, + expected_ci_high=0.28167151608781216, ) self._test_grouped_instance_confidence_interval( - metric=FixedGroupCohensDParaphraseStringContainment(), - expected_ci_low=-0.8660254037844387, - expected_ci_high=-0.5773502691896257, + metric=FixedGroupHedgesGParaphraseStringContainment(), + expected_ci_low=-0.4878693769090451, + expected_ci_high=-0.23328473740792172, ) # pass global dict because there are additional fields other than the main score From a480683de69e6f228fdb4a260a7f4dff389c2543 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Tue, 13 Feb 2024 22:53:58 +0200 Subject: [PATCH 67/83] rename Cohen's d --- .../robustness/fixed_group_hedges_g_paraphrase_accuracy.json | 3 +++ .../fixed_group_hedges_g_paraphrase_string_containment.json | 3 +++ 2 files changed, 6 insertions(+) create mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_hedges_g_paraphrase_accuracy.json create mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_hedges_g_paraphrase_string_containment.json diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_hedges_g_paraphrase_accuracy.json b/src/unitxt/catalog/metrics/robustness/fixed_group_hedges_g_paraphrase_accuracy.json new file mode 100644 index 0000000000..5028138883 --- /dev/null +++ b/src/unitxt/catalog/metrics/robustness/fixed_group_hedges_g_paraphrase_accuracy.json @@ -0,0 +1,3 @@ +{ + "type": "fixed_group_hedges_g_paraphrase_accuracy" +} diff --git 
a/src/unitxt/catalog/metrics/robustness/fixed_group_hedges_g_paraphrase_string_containment.json b/src/unitxt/catalog/metrics/robustness/fixed_group_hedges_g_paraphrase_string_containment.json new file mode 100644 index 0000000000..245602d8ce --- /dev/null +++ b/src/unitxt/catalog/metrics/robustness/fixed_group_hedges_g_paraphrase_string_containment.json @@ -0,0 +1,3 @@ +{ + "type": "fixed_group_hedges_g_paraphrase_string_containment" +} From fe1bde6f620e5386b416cec38a0509f486e47a15 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Tue, 13 Feb 2024 23:18:16 +0200 Subject: [PATCH 68/83] add ZeroDivisionError in Hedge's g --- prepare/metrics/grouped_instance_metrics.py | 2 +- src/unitxt/metrics.py | 14 ++++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/prepare/metrics/grouped_instance_metrics.py b/prepare/metrics/grouped_instance_metrics.py index e59313d227..4b97bad198 100644 --- a/prepare/metrics/grouped_instance_metrics.py +++ b/prepare/metrics/grouped_instance_metrics.py @@ -446,7 +446,7 @@ ) -# Cohen's D will always use fixed groups +# Hedge's g will always use fixed groups metric = FixedGroupHedgesGParaphraseAccuracy() global_target = { "fixed_group_hedges_g_paraphrase_accuracy": -1.73, diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 5b20495756..14d3a87f81 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -2398,15 +2398,21 @@ def hedges_g( ] var_total = sum([(nn - 1) * vv for vv, nn in zip(group_var, group_n)]) pooled_sd = np.sqrt(var_total / (sum(group_n) - 2)) - g = float(group_mean[1] - group_mean[0]) / pooled_sd + + max_absolute_value = 5 + try: + g = float(group_mean[1] - group_mean[0]) / pooled_sd + except ZeroDivisionError: + # return a large effect size to avoid explosion if there is zero variance + g = np.sign(group_mean[1] - group_mean[0]) * max_absolute_value + n = sum(group_n) if 3 < n < 50: # small sample adjustment see https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/hedgeg.htm # the multiplier is 0 if n <= 3 g *= ((n - 3) / (n - 2.25)) * np.sqrt((n - 2) / n) - - # clip it at a very large value so it doesn't become infinite if the variance (denominator) is 0 - g = float(np.clip(a=g, a_min=-5, a_max=5)) + # clip it at a very large value so it doesn't become infinite if the variance (denominator) is very small or 0 + g = float(np.clip(a=g, a_min=-1 * max_absolute_value, a_max=max_absolute_value)) if not interpret: return g From 18ec140777f44f45acee55f006dcd94d90c7faf0 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Wed, 14 Feb 2024 11:34:54 +0200 Subject: [PATCH 69/83] rename Hedges g to Norm Hedges g, and divide by maximum to rescale to -1, 1 --- prepare/metrics/grouped_instance_metrics.py | 40 ++++++++++---------- src/unitxt/metrics.py | 41 ++++++++++++--------- tests/library/test_metrics.py | 24 ++++++------ 3 files changed, 56 insertions(+), 49 deletions(-) diff --git a/prepare/metrics/grouped_instance_metrics.py b/prepare/metrics/grouped_instance_metrics.py index 4b97bad198..085efedd4a 100644 --- a/prepare/metrics/grouped_instance_metrics.py +++ b/prepare/metrics/grouped_instance_metrics.py @@ -4,8 +4,6 @@ from src.unitxt import add_to_catalog from src.unitxt.metrics import ( - FixedGroupHedgesGParaphraseAccuracy, - FixedGroupHedgesGParaphraseStringContainment, FixedGroupMeanAccuracy, FixedGroupMeanBaselineAccuracy, FixedGroupMeanBaselineStringContainment, @@ -14,6 +12,8 @@ FixedGroupMeanStringContainment, FixedGroupNormCohensHParaphraseAccuracy, 
FixedGroupNormCohensHParaphraseStringContainment, + FixedGroupNormHedgesGParaphraseAccuracy, + FixedGroupNormHedgesGParaphraseStringContainment, FixedGroupPDRParaphraseAccuracy, FixedGroupPDRParaphraseStringContainment, GroupMeanAccuracy, @@ -447,15 +447,15 @@ # Hedge's g will always use fixed groups -metric = FixedGroupHedgesGParaphraseAccuracy() +metric = FixedGroupNormHedgesGParaphraseAccuracy() global_target = { - "fixed_group_hedges_g_paraphrase_accuracy": -1.73, - "score": -1.73, - "score_name": "fixed_group_hedges_g_paraphrase_accuracy", - "score_ci_low": -5.0, - "score_ci_high": 0.28, - "fixed_group_hedges_g_paraphrase_accuracy_ci_low": -5.0, - "fixed_group_hedges_g_paraphrase_accuracy_ci_high": 0.28, + "fixed_group_norm_hedges_g_paraphrase_accuracy": -0.35, + "score": -0.35, + "score_name": "fixed_group_norm_hedges_g_paraphrase_accuracy", + "score_ci_low": -1.0, + "score_ci_high": 0.02, + "fixed_group_norm_hedges_g_paraphrase_accuracy_ci_low": -1.0, + "fixed_group_norm_hedges_g_paraphrase_accuracy_ci_high": 0.02, } @@ -470,20 +470,20 @@ add_to_catalog( metric, - "metrics.robustness.fixed_group_hedges_g_paraphrase_accuracy", + "metrics.robustness.fixed_group_norm_hedges_g_paraphrase_accuracy", overwrite=True, ) -metric = FixedGroupHedgesGParaphraseStringContainment() +metric = FixedGroupNormHedgesGParaphraseStringContainment() global_target = { - "fixed_group_hedges_g_paraphrase_string_containment": -0.4, - "score": -0.4, - "score_name": "fixed_group_hedges_g_paraphrase_string_containment", - "score_ci_low": -0.49, - "score_ci_high": -0.23, - "fixed_group_hedges_g_paraphrase_string_containment_ci_low": -0.49, - "fixed_group_hedges_g_paraphrase_string_containment_ci_high": -0.23, + "fixed_group_norm_hedges_g_paraphrase_string_containment": -0.08, + "score": -0.08, + "score_name": "fixed_group_norm_hedges_g_paraphrase_string_containment", + "score_ci_low": -0.1, + "score_ci_high": -0.05, + "fixed_group_norm_hedges_g_paraphrase_string_containment_ci_low": -0.1, + "fixed_group_norm_hedges_g_paraphrase_string_containment_ci_high": -0.05, } @@ -498,7 +498,7 @@ add_to_catalog( metric, - "metrics.robustness.fixed_group_hedges_g_paraphrase_string_containment", + "metrics.robustness.fixed_group_norm_hedges_g_paraphrase_string_containment", overwrite=True, ) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 14d3a87f81..3396284b63 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -2347,13 +2347,13 @@ def normalized_cohens_h( return norm_h, interpret_effect_size(h) -def hedges_g( +def normalized_hedges_g( subgroup_scores_dict: Dict[str, List], control_subgroup_types: List[str], comparison_subgroup_types: List[str], interpret=False, ): - """Hedge's g effect size between mean of two samples. Better than Cohen's d for small sample sizes. + """Hedge's g effect size between mean of two samples, normalized to interval [-1,1]. Better than Cohen's d for small sample sizes. Takes into account the variances within the samples, not just the means. @@ -2364,7 +2364,7 @@ def hedges_g( to be compared to the control group. 
interpret: boolean, whether to interpret the significance of the score or not Returns: - float score, and a string interpretation if interpret=True + float score between -1 and 1, and a string interpretation if interpret=True """ ( subgroup_scores_dict, @@ -2387,7 +2387,7 @@ def hedges_g( # if at least one sample size is 0 for one type, no comparison can be made at all # if both sample sizes are 1, then the denominator is undefined since divide by n1 + n2 - 2 # so require at least one sample to have > 1 observation, and both to have >= 1. - g = np.nan + g, norm_g = np.nan, np.nan else: # otherwise, calculate the variances group_mean = [mean(scores) for scores in group_scores_list] @@ -2400,11 +2400,17 @@ def hedges_g( pooled_sd = np.sqrt(var_total / (sum(group_n) - 2)) max_absolute_value = 5 - try: - g = float(group_mean[1] - group_mean[0]) / pooled_sd - except ZeroDivisionError: - # return a large effect size to avoid explosion if there is zero variance - g = np.sign(group_mean[1] - group_mean[0]) * max_absolute_value + gmd = float(group_mean[1] - group_mean[0]) + + if gmd == 0: + # if exactly the same, return 0 + g = 0.0 + else: + try: + g = gmd / pooled_sd + except ZeroDivisionError: + # return a large effect size to avoid explosion if there is zero variance + g = np.sign(gmd) * max_absolute_value n = sum(group_n) if 3 < n < 50: @@ -2413,10 +2419,11 @@ def hedges_g( g *= ((n - 3) / (n - 2.25)) * np.sqrt((n - 2) / n) # clip it at a very large value so it doesn't become infinite if the variance (denominator) is very small or 0 g = float(np.clip(a=g, a_min=-1 * max_absolute_value, a_max=max_absolute_value)) + norm_g = g / max_absolute_value if not interpret: - return g - return g, interpret_effect_size(g) + return norm_g + return norm_g, interpret_effect_size(g) def mean_subgroup_score( @@ -2611,13 +2618,13 @@ class FixedGroupNormCohensHParaphraseStringContainment(StringContainment): # using Cohen's d (takes into account internal variation in group scores) -class FixedGroupHedgesGParaphraseAccuracy(Accuracy): +class FixedGroupNormHedgesGParaphraseAccuracy(Accuracy): subgroup_column = "variant_type" reduction_map = { "group_mean": { "agg_func": [ - "hedges_g_paraphrase", - lambda scd: hedges_g( + "norm_hedges_g_paraphrase", + lambda scd: normalized_hedges_g( subgroup_scores_dict=scd, control_subgroup_types=["original"], comparison_subgroup_types=["paraphrase"], @@ -2628,13 +2635,13 @@ class FixedGroupHedgesGParaphraseAccuracy(Accuracy): } -class FixedGroupHedgesGParaphraseStringContainment(StringContainment): +class FixedGroupNormHedgesGParaphraseStringContainment(StringContainment): subgroup_column = "variant_type" reduction_map = { "group_mean": { "agg_func": [ - "hedges_g_paraphrase", - lambda scd: hedges_g( + "norm_hedges_g_paraphrase", + lambda scd: normalized_hedges_g( subgroup_scores_dict=scd, control_subgroup_types=["original"], comparison_subgroup_types=["paraphrase"], diff --git a/tests/library/test_metrics.py b/tests/library/test_metrics.py index 757b133810..bd11b9a1d7 100644 --- a/tests/library/test_metrics.py +++ b/tests/library/test_metrics.py @@ -11,8 +11,6 @@ F1Micro, F1MicroMultiLabel, F1Weighted, - FixedGroupHedgesGParaphraseAccuracy, - FixedGroupHedgesGParaphraseStringContainment, FixedGroupMeanAccuracy, FixedGroupMeanBaselineAccuracy, FixedGroupMeanBaselineStringContainment, @@ -21,6 +19,8 @@ FixedGroupMeanStringContainment, FixedGroupNormCohensHParaphraseAccuracy, FixedGroupNormCohensHParaphraseStringContainment, + FixedGroupNormHedgesGParaphraseAccuracy, + 
FixedGroupNormHedgesGParaphraseStringContainment, FixedGroupPDRParaphraseAccuracy, FixedGroupPDRParaphraseStringContainment, GroupMeanAccuracy, @@ -486,8 +486,8 @@ def test_grouped_instance_metrics(self): FixedGroupNormCohensHParaphraseStringContainment(), FixedGroupPDRParaphraseAccuracy(), FixedGroupPDRParaphraseStringContainment(), - FixedGroupHedgesGParaphraseAccuracy(), - FixedGroupHedgesGParaphraseStringContainment(), + FixedGroupNormHedgesGParaphraseAccuracy(), + FixedGroupNormHedgesGParaphraseStringContainment(), ] global_targets = [ 0.225, @@ -503,8 +503,8 @@ def test_grouped_instance_metrics(self): -0.4639421840102023, 0.8333333333333334, 0.4444444444444445, - -1.7282993195760106, - -0.4030078304086706, + -0.34565986391520215, + -0.08060156608173413, ] for metric, target in zip(accuracy_metrics, global_targets): outputs = apply_metric( @@ -748,15 +748,15 @@ def test_grouped_instance_metric_confidence_interval(self): ) self._test_grouped_instance_confidence_interval( - metric=FixedGroupHedgesGParaphraseAccuracy(), - expected_ci_low=-5.0, - expected_ci_high=0.28167151608781216, + metric=FixedGroupNormHedgesGParaphraseAccuracy(), + expected_ci_low=-1.0, + expected_ci_high=0.01892225367237965, ) self._test_grouped_instance_confidence_interval( - metric=FixedGroupHedgesGParaphraseStringContainment(), - expected_ci_low=-0.4878693769090451, - expected_ci_high=-0.23328473740792172, + metric=FixedGroupNormHedgesGParaphraseStringContainment(), + expected_ci_low=-0.09757387538180902, + expected_ci_high=-0.046656947481584346, ) # pass global dict because there are additional fields other than the main score From 7552e59c9cc8c4bedbbf5c126e69cc2ccddf52a9 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Wed, 14 Feb 2024 11:43:02 +0200 Subject: [PATCH 70/83] initial commit, rename from hedges_g --- .../fixed_group_norm_hedges_g_paraphrase_accuracy.json | 3 +++ ...ixed_group_norm_hedges_g_paraphrase_string_containment.json | 3 +++ 2 files changed, 6 insertions(+) create mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_norm_hedges_g_paraphrase_accuracy.json create mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_norm_hedges_g_paraphrase_string_containment.json diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_norm_hedges_g_paraphrase_accuracy.json b/src/unitxt/catalog/metrics/robustness/fixed_group_norm_hedges_g_paraphrase_accuracy.json new file mode 100644 index 0000000000..5d4d03a81f --- /dev/null +++ b/src/unitxt/catalog/metrics/robustness/fixed_group_norm_hedges_g_paraphrase_accuracy.json @@ -0,0 +1,3 @@ +{ + "type": "fixed_group_norm_hedges_g_paraphrase_accuracy" +} diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_norm_hedges_g_paraphrase_string_containment.json b/src/unitxt/catalog/metrics/robustness/fixed_group_norm_hedges_g_paraphrase_string_containment.json new file mode 100644 index 0000000000..9332b2c63c --- /dev/null +++ b/src/unitxt/catalog/metrics/robustness/fixed_group_norm_hedges_g_paraphrase_string_containment.json @@ -0,0 +1,3 @@ +{ + "type": "fixed_group_norm_hedges_g_paraphrase_string_containment" +} From 4dbd997a0d6afc1053e15bbe5b7f5be0d58fb673 Mon Sep 17 00:00:00 2001 From: sam-data-guy-iam <124508972+sam-data-guy-iam@users.noreply.github.com> Date: Wed, 14 Feb 2024 11:43:56 +0200 Subject: [PATCH 71/83] Delete src/unitxt/catalog/metrics/robustness/fixed_group_hedges_g_paraphrase_string_containment.json rename to norm hedges g --- .../fixed_group_hedges_g_paraphrase_string_containment.json | 3 --- 1 file changed, 3 
deletions(-) delete mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_hedges_g_paraphrase_string_containment.json diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_hedges_g_paraphrase_string_containment.json b/src/unitxt/catalog/metrics/robustness/fixed_group_hedges_g_paraphrase_string_containment.json deleted file mode 100644 index 245602d8ce..0000000000 --- a/src/unitxt/catalog/metrics/robustness/fixed_group_hedges_g_paraphrase_string_containment.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "type": "fixed_group_hedges_g_paraphrase_string_containment" -} From f014bc6a273064771001c18fef7357fc3b366591 Mon Sep 17 00:00:00 2001 From: sam-data-guy-iam <124508972+sam-data-guy-iam@users.noreply.github.com> Date: Wed, 14 Feb 2024 11:44:05 +0200 Subject: [PATCH 72/83] Delete src/unitxt/catalog/metrics/robustness/fixed_group_hedges_g_paraphrase_accuracy.json rename to norm hedges g --- .../robustness/fixed_group_hedges_g_paraphrase_accuracy.json | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_hedges_g_paraphrase_accuracy.json diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_hedges_g_paraphrase_accuracy.json b/src/unitxt/catalog/metrics/robustness/fixed_group_hedges_g_paraphrase_accuracy.json deleted file mode 100644 index 5028138883..0000000000 --- a/src/unitxt/catalog/metrics/robustness/fixed_group_hedges_g_paraphrase_accuracy.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "type": "fixed_group_hedges_g_paraphrase_accuracy" -} From 04cec38262300df26613ac71f73d9a255939a384 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Wed, 14 Feb 2024 11:53:26 +0200 Subject: [PATCH 73/83] fix PDR so if both means are 0, return 0 rather than NaN --- src/unitxt/metrics.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 3396284b63..f38d71fa3b 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -2229,8 +2229,13 @@ def performance_drop_rate( return np.nan control_mean = mean(group_scores_list[0]) comparison_mean = mean(group_scores_list[1]) - - return np.nan if control_mean == 0 else 1 - comparison_mean / control_mean + if control_mean == 0: + # return 0 if comparison is also 0 + if comparison_mean == 0: + return 0 + return np.nan + # otherwise, take the percentage change (which may also be 0) + return 1 - comparison_mean / control_mean def interpret_effect_size(x: float): From be7410e33a99621930c0cc9ccf3e64a8d7f51faf Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Wed, 14 Feb 2024 18:14:07 +0200 Subject: [PATCH 74/83] final PR changes, remove agg_func definition --- src/unitxt/metrics.py | 42 +++++++++++++++++++----------------------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index f38d71fa3b..9e0af28f31 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -663,20 +663,20 @@ def compute_instance_scores( return instances, global_score def get_group_scores( - self, instances: List[dict], field_names: List[str], group_aggregation_func + self, instances: List[dict], score_names: List[str], group_aggregation_func ): - """Return a list of group aggregation function value for group_mean reduction. + """Group scores by the group_id and subgroup_type fields of each instance, and compute group_aggregation_func by group. Args: instances: List of observation instances with instance-level scores (fields) computed. 
- field_names: List of instance score names in each instance to apply the aggregation function. + score_names: List of instance score names in each instance to apply the aggregation function. group_aggregation_func: Callable aggregation function accepting a list of numeric scores; or, if self.subgroup_column is not None, a dict of subgroup types scores by subgroup_column value. callable function returns a single score for the group Returns: List of dicts, each corresponding to a group of instances (defined by 'group_id'), - with a group score for each field_name + with an aggregate group score for each score_name """ from collections import defaultdict @@ -688,22 +688,13 @@ def get_group_scores( # check if function has fields for subgroup_column uses_subgroups = self.subgroup_column is not None + default_subgroup_name = "default" if uses_subgroups: assert all( self.subgroup_column in instance["additional_inputs"] for instance in instances ), f"all instances must have field {self.subgroup_column}' in additional_inputs" - # define the aggregation function - def agg_func(subgroup_scores_dict): - # if function a uses the subgroup_column values, pass the full dict - return group_aggregation_func(subgroup_scores_dict) - else: - - def agg_func(subgroup_scores_dict): - # otherwise pass the default 'original' scores to the default argument - return group_aggregation_func(subgroup_scores_dict["default"]) - # loop through the instances and group the scores for instance in instances: additional_inputs = instance["additional_inputs"] @@ -716,20 +707,26 @@ def agg_func(subgroup_scores_dict): # for functions that do comparisons between subgroup_column groups # if function doesn't use subgroup_column, or none is present, set "default" as default value, and pass all scores subgroup_type = ( - additional_inputs[self.subgroup_column] if uses_subgroups else "default" + additional_inputs[self.subgroup_column] + if uses_subgroups + else default_subgroup_name ) - for field_name in field_names: - group_to_instance_scores[group_key][field_name][subgroup_type].append( - instance["score"]["instance"][field_name] + for score_name in score_names: + group_to_instance_scores[group_key][score_name][subgroup_type].append( + instance["score"]["instance"][score_name] ) - # now apply the appropriate aggregation function to each group + # if group_aggregation_func expects a subgroup-types score dict, pass it; otherwise pass the default type list of scores return [ { "score": { "instance": { - field_name: agg_func(score_dict) - for field_name, score_dict in group_scores.items() + score_name: group_aggregation_func( + score_dict + if uses_subgroups + else score_dict[default_subgroup_name] + ) + for score_name, score_dict in group_scores.items() } } } @@ -2151,7 +2148,6 @@ def should_ignore_element(self, element, additional_input): return element == "none" -# define metrics that return means of an aggregation function applied across levels of a grouping variable def validate_subgroup_types( subgroup_scores_dict: Dict[str, List], control_subgroup_types: List[str], @@ -2353,7 +2349,7 @@ def normalized_cohens_h( def normalized_hedges_g( - subgroup_scores_dict: Dict[str, List], + subgroup_scores_dict: Dict[str, List[float]], control_subgroup_types: List[str], comparison_subgroup_types: List[str], interpret=False, From 3a98a47d72a4a4cfbb74e27add2aec848b3df074 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Wed, 14 Feb 2024 18:37:48 +0200 Subject: [PATCH 75/83] remove checks on instances in get_group_scores that were already 
validated --- src/unitxt/metrics.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 9e0af28f31..7390382d7f 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -689,20 +689,9 @@ def get_group_scores( # check if function has fields for subgroup_column uses_subgroups = self.subgroup_column is not None default_subgroup_name = "default" - if uses_subgroups: - assert all( - self.subgroup_column in instance["additional_inputs"] - for instance in instances - ), f"all instances must have field {self.subgroup_column}' in additional_inputs" - # loop through the instances and group the scores for instance in instances: additional_inputs = instance["additional_inputs"] - if "group_id" not in additional_inputs: - raise ValueError( - f"Missing 'group_id' from instance {instance}. " - f"This field is required for group based metric computation." - ) group_key = additional_inputs["group_id"] # for functions that do comparisons between subgroup_column groups # if function doesn't use subgroup_column, or none is present, set "default" as default value, and pass all scores From 81dbff480261def927115e5760cf5b5048ccf798 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Thu, 15 Feb 2024 12:22:15 +0200 Subject: [PATCH 76/83] remove deepcopy --- src/unitxt/metrics.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 7390382d7f..93f63911f3 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -4,7 +4,6 @@ import warnings from abc import ABC, abstractmethod from collections import Counter -from copy import deepcopy from dataclasses import field from statistics import mean from typing import Any, Dict, Generator, List, Optional, Tuple @@ -566,7 +565,6 @@ def accuracy_diff(subgroup_scores_dict, expected_subgroup_types=['original', 'pa def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator: instances, global_score = self.compute_instance_scores(stream) - from copy import deepcopy for reduction_type, reduction_params in self.reduction_map.items(): assert ( @@ -579,7 +577,7 @@ def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generato if reduction_type == "mean": reduction_fields = list(set(reduction_params)) # no group reduction, so resample instances individually - scores_to_resample = deepcopy(instances) + scores_to_resample = instances elif reduction_type == "group_mean": self._validate_group_mean_reduction(instances=instances) reduction_fields = ( @@ -736,7 +734,7 @@ def _set_up_group_mean_aggregation( ) else: # pass the instance scores to resample, and calculate the group aggregation on the resamplings - scores_to_resample = deepcopy(instances) + scores_to_resample = instances def aggregation_function( instances, From bcc1ae05296a91a6d89fe37aff6ddc49a4f7c746 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Tue, 20 Feb 2024 13:17:54 +0200 Subject: [PATCH 77/83] fix some comments and parameter names. Make TokenOverlap do conversion of prediction and reference to strings internally. 
--- src/unitxt/metrics.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 93f63911f3..e234dadab7 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -98,15 +98,15 @@ def _can_compute_confidence_intervals(self, num_predictions): ) @staticmethod - def average_item_scores(instances: List[dict], field_name: str): - """Calculate mean of a set of instance scores (given by field_name), omitting NaN values. + def average_item_scores(instances: List[dict], score_name: str): + """Calculate mean of a set of instance scores (given by score_name), omitting NaN values. Args: instances: list of dicts of each instance's instance scores. - field_name: score field names to compute mean for. + score_name: score field names to compute the mean for. """ return nan_mean( - [instance["score"]["instance"][field_name] for instance in instances] + [instance["score"]["instance"][score_name] for instance in instances] ) def score_based_confidence_interval( @@ -455,7 +455,10 @@ class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval): default_factory=lambda: settings.num_resamples_for_instance_metrics ) - # column required to be in additional_inputs if group_mean aggregation function requires a dict input of labels and their lists of scores + # some group_mean aggregation functions (3rd element of "agg_func" list in the reduction) + # only require a list of instance scores (e.g., mean, median, etc.). Others aggregation functions + # require an additional column (e.g., a subgroup identifier) by which the instance scores will be grouped + # if subgroup_column is not None, a column by the specified name will be required in additional_inputs subgroup_column = None implemented_reductions: List[str] = field( default_factory=lambda: ["mean", "group_mean"] @@ -1557,8 +1560,8 @@ def compute( def _compute_single_ref( self, reference: Any, prediction: Any ) -> Tuple[float, float, float]: - prediction_tokens = normalize_answer(prediction).split() - reference_tokens = normalize_answer(reference).split() + prediction_tokens = normalize_answer(str(prediction)).split() + reference_tokens = normalize_answer(str(reference)).split() common = Counter(prediction_tokens) & Counter(reference_tokens) num_same = sum(common.values()) if num_same == 0: From ccf447f4d8e8949b55543365ac450fcaec487b4c Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Tue, 20 Feb 2024 16:55:18 +0200 Subject: [PATCH 78/83] initial commit --- .../fixed_group_absval_norm_cohens_h_paraphrase_accuracy.json | 3 +++ ...oup_absval_norm_cohens_h_paraphrase_string_containment.json | 3 +++ .../fixed_group_absval_norm_hedges_g_paraphrase_accuracy.json | 3 +++ ...oup_absval_norm_hedges_g_paraphrase_string_containment.json | 3 +++ 4 files changed, 12 insertions(+) create mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_absval_norm_cohens_h_paraphrase_accuracy.json create mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_absval_norm_cohens_h_paraphrase_string_containment.json create mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_absval_norm_hedges_g_paraphrase_accuracy.json create mode 100644 src/unitxt/catalog/metrics/robustness/fixed_group_absval_norm_hedges_g_paraphrase_string_containment.json diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_absval_norm_cohens_h_paraphrase_accuracy.json b/src/unitxt/catalog/metrics/robustness/fixed_group_absval_norm_cohens_h_paraphrase_accuracy.json new file mode 100644 
index 0000000000..6e3ba9893f --- /dev/null +++ b/src/unitxt/catalog/metrics/robustness/fixed_group_absval_norm_cohens_h_paraphrase_accuracy.json @@ -0,0 +1,3 @@ +{ + "type": "fixed_group_absval_norm_cohens_h_paraphrase_accuracy" +} diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_absval_norm_cohens_h_paraphrase_string_containment.json b/src/unitxt/catalog/metrics/robustness/fixed_group_absval_norm_cohens_h_paraphrase_string_containment.json new file mode 100644 index 0000000000..a15032ba98 --- /dev/null +++ b/src/unitxt/catalog/metrics/robustness/fixed_group_absval_norm_cohens_h_paraphrase_string_containment.json @@ -0,0 +1,3 @@ +{ + "type": "fixed_group_absval_norm_cohens_h_paraphrase_string_containment" +} diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_absval_norm_hedges_g_paraphrase_accuracy.json b/src/unitxt/catalog/metrics/robustness/fixed_group_absval_norm_hedges_g_paraphrase_accuracy.json new file mode 100644 index 0000000000..0331fae5df --- /dev/null +++ b/src/unitxt/catalog/metrics/robustness/fixed_group_absval_norm_hedges_g_paraphrase_accuracy.json @@ -0,0 +1,3 @@ +{ + "type": "fixed_group_absval_norm_hedges_g_paraphrase_accuracy" +} diff --git a/src/unitxt/catalog/metrics/robustness/fixed_group_absval_norm_hedges_g_paraphrase_string_containment.json b/src/unitxt/catalog/metrics/robustness/fixed_group_absval_norm_hedges_g_paraphrase_string_containment.json new file mode 100644 index 0000000000..8a268790da --- /dev/null +++ b/src/unitxt/catalog/metrics/robustness/fixed_group_absval_norm_hedges_g_paraphrase_string_containment.json @@ -0,0 +1,3 @@ +{ + "type": "fixed_group_absval_norm_hedges_g_paraphrase_string_containment" +} From 7472623051026520e15cdc6192f2d98219292be1 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Tue, 20 Feb 2024 17:13:32 +0200 Subject: [PATCH 79/83] add absolute value version of Hedges G / Cohens H --- src/unitxt/metrics.py | 82 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 80 insertions(+), 2 deletions(-) diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 16030fb0d5..17822be274 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -1585,7 +1585,8 @@ def compute( self, references: List[Any], prediction: Any, additional_inputs: List[Dict] ) -> dict: results = [ - self._compute_single_ref(reference, prediction) for reference in references + self._compute_single_ref(str(reference), str(prediction)) + for reference in references ] return { measure: max(r[i] for r in results) @@ -2643,7 +2644,7 @@ class FixedGroupNormCohensHParaphraseStringContainment(StringContainment): } -# using Cohen's d (takes into account internal variation in group scores) +# using Hedges' g (takes into account internal variation in group scores) class FixedGroupNormHedgesGParaphraseAccuracy(Accuracy): subgroup_column = "variant_type" reduction_map = { @@ -2676,3 +2677,80 @@ class FixedGroupNormHedgesGParaphraseStringContainment(StringContainment): ], } } + + +# for above metrics, take absolute value of group score first; this measures variation in either direction +class FixedGroupAbsvalNormCohensHParaphraseAccuracy(Accuracy): + subgroup_column = "variant_type" + reduction_map = { + "group_mean": { + "agg_func": [ + "absval_norm_cohens_h_paraphrase", + lambda scd: np.abs( + normalized_cohens_h( + subgroup_scores_dict=scd, + control_subgroup_types=["original"], + comparison_subgroup_types=["paraphrase"], + ) + ), + True, + ], + } + } + + +class 
FixedGroupAbsvalNormCohensHParaphraseStringContainment(StringContainment): + subgroup_column = "variant_type" + reduction_map = { + "group_mean": { + "agg_func": [ + "absval_norm_cohens_h_paraphrase", + lambda scd: np.abs( + normalized_cohens_h( + subgroup_scores_dict=scd, + control_subgroup_types=["original"], + comparison_subgroup_types=["paraphrase"], + ) + ), + True, + ], + } + } + + +class FixedGroupAbsvalNormHedgesGParaphraseAccuracy(Accuracy): + subgroup_column = "variant_type" + reduction_map = { + "group_mean": { + "agg_func": [ + "absval_norm_hedges_g_paraphrase", + lambda scd: np.abs( + normalized_hedges_g( + subgroup_scores_dict=scd, + control_subgroup_types=["original"], + comparison_subgroup_types=["paraphrase"], + ) + ), + True, + ], + } + } + + +class FixedGroupAbsvalNormHedgesGParaphraseStringContainment(StringContainment): + subgroup_column = "variant_type" + reduction_map = { + "group_mean": { + "agg_func": [ + "absval_norm_hedges_g_paraphrase", + lambda scd: np.abs( + normalized_hedges_g( + subgroup_scores_dict=scd, + control_subgroup_types=["original"], + comparison_subgroup_types=["paraphrase"], + ) + ), + True, + ], + } + } From 3593542ee4bbdcc1e1090fb406e96fac53f0afc2 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Tue, 20 Feb 2024 18:19:24 +0200 Subject: [PATCH 80/83] add absolute value version of Hedges G / Cohens H to tests --- prepare/metrics/grouped_instance_metrics.py | 168 ++++++++++++++++---- tests/library/test_metrics.py | 87 ++++++---- 2 files changed, 194 insertions(+), 61 deletions(-) diff --git a/prepare/metrics/grouped_instance_metrics.py b/prepare/metrics/grouped_instance_metrics.py index 085efedd4a..0e5363aa78 100644 --- a/prepare/metrics/grouped_instance_metrics.py +++ b/prepare/metrics/grouped_instance_metrics.py @@ -1,9 +1,9 @@ -from copy import deepcopy - -import numpy as np - from src.unitxt import add_to_catalog from src.unitxt.metrics import ( + FixedGroupAbsvalNormCohensHParaphraseAccuracy, + FixedGroupAbsvalNormCohensHParaphraseStringContainment, + FixedGroupAbsvalNormHedgesGParaphraseAccuracy, + FixedGroupAbsvalNormHedgesGParaphraseStringContainment, FixedGroupMeanAccuracy, FixedGroupMeanBaselineAccuracy, FixedGroupMeanBaselineStringContainment, @@ -58,33 +58,26 @@ [" abcdefg", "AB", "abcd"], ] -# possibly multi-column group identifier; 'ignore' is unused -# use deepcopy so that dicts in list are independent and can be updated separately -additional_inputs = ( - [deepcopy({"group": "grp1", "id": 0, "ignore": 1}) for _ in range(5)] - + [deepcopy({"group": "grp1", "id": 1, "ignore": 1}) for _ in range(5)] - + [deepcopy({"group": "grp2", "id": 0, "ignore": 1}) for _ in range(4)] - + [deepcopy({"group": "grp2", "id": 1, "ignore": 0}) for _ in range(1)] -) -# for group_mean aggregations with a subgroup_comparison, add a variant_type label -# these groupings correspond in length to the group identifiers above -variant_type = np.concatenate( - [ - np.repeat(a=["original", "paraphrase"], repeats=reps) - for reps in [[1, 4], [1, 4], [1, 3], [1, 0]] - ] -).tolist() - -# construct grouping_field by combining two other fields (and ignoring one); mimics what you would do in cards -group_by_fields = ["group", "id"] - -for ai, vt in zip(additional_inputs, variant_type): - ai.update( - { - "group_id": "_".join([str(ai[ff]) for ff in group_by_fields]), - "variant_type": vt, - } - ) +# additional inputs, consisting of a group_id (group instance scores by this, then apply aggregation function) +# and variant_type (for metrics that compare, say original 
vs paraphrase instance score) +# create 4 groups, of sizes 5,5,4,1 +additional_inputs = [ + {"group_id": "group1", "variant_type": "original"}, + {"group_id": "group1", "variant_type": "paraphrase"}, + {"group_id": "group1", "variant_type": "paraphrase"}, + {"group_id": "group1", "variant_type": "paraphrase"}, + {"group_id": "group1", "variant_type": "paraphrase"}, + {"group_id": "group2", "variant_type": "original"}, + {"group_id": "group2", "variant_type": "paraphrase"}, + {"group_id": "group2", "variant_type": "paraphrase"}, + {"group_id": "group2", "variant_type": "paraphrase"}, + {"group_id": "group2", "variant_type": "paraphrase"}, + {"group_id": "group3", "variant_type": "original"}, + {"group_id": "group3", "variant_type": "paraphrase"}, + {"group_id": "group3", "variant_type": "paraphrase"}, + {"group_id": "group3", "variant_type": "paraphrase"}, + {"group_id": "group4", "variant_type": "original"}, +] instance_targets_string_containment = [ @@ -502,6 +495,119 @@ overwrite=True, ) +# absolute value of above metrics + +metric = FixedGroupAbsvalNormCohensHParaphraseAccuracy() +global_target = { + "fixed_group_absval_norm_cohens_h_paraphrase_accuracy": 0.65, + "score": 0.65, + "score_name": "fixed_group_absval_norm_cohens_h_paraphrase_accuracy", + "score_ci_low": 0.33, + "score_ci_high": 1.0, + "fixed_group_absval_norm_cohens_h_paraphrase_accuracy_ci_low": 0.33, + "fixed_group_absval_norm_cohens_h_paraphrase_accuracy_ci_high": 1.0, +} + + +outputs = test_metric( + metric=metric, + predictions=predictions, + references=references, + instance_targets=instance_targets_accuracy, + global_target=global_target, + additional_inputs=additional_inputs, +) + +add_to_catalog( + metric, + "metrics.robustness.fixed_group_absval_norm_cohens_h_paraphrase_accuracy", + overwrite=True, +) + + +metric = FixedGroupAbsvalNormCohensHParaphraseStringContainment() +global_target = { + "fixed_group_absval_norm_cohens_h_paraphrase_string_containment": 0.46, + "score": 0.46, + "score_name": "fixed_group_absval_norm_cohens_h_paraphrase_string_containment", + "score_ci_low": 0.39, + "score_ci_high": 0.5, + "fixed_group_absval_norm_cohens_h_paraphrase_string_containment_ci_low": 0.39, + "fixed_group_absval_norm_cohens_h_paraphrase_string_containment_ci_high": 0.5, +} + + +outputs = test_metric( + metric=metric, + predictions=predictions, + references=references, + instance_targets=instance_targets_string_containment, + global_target=global_target, + additional_inputs=additional_inputs, +) + +add_to_catalog( + metric, + "metrics.robustness.fixed_group_absval_norm_cohens_h_paraphrase_string_containment", + overwrite=True, +) + + +metric = FixedGroupAbsvalNormHedgesGParaphraseAccuracy() +global_target = { + "fixed_group_absval_norm_hedges_g_paraphrase_accuracy": 0.38, + "score": 0.38, + "score_name": "fixed_group_absval_norm_hedges_g_paraphrase_accuracy", + "score_ci_low": 0.06, + "score_ci_high": 1.0, + "fixed_group_absval_norm_hedges_g_paraphrase_accuracy_ci_low": 0.06, + "fixed_group_absval_norm_hedges_g_paraphrase_accuracy_ci_high": 1.0, +} + + +outputs = test_metric( + metric=metric, + predictions=predictions, + references=references, + instance_targets=instance_targets_accuracy, + global_target=global_target, + additional_inputs=additional_inputs, +) + +add_to_catalog( + metric, + "metrics.robustness.fixed_group_absval_norm_hedges_g_paraphrase_accuracy", + overwrite=True, +) + + +metric = FixedGroupAbsvalNormHedgesGParaphraseStringContainment() +global_target = { + 
"fixed_group_absval_norm_hedges_g_paraphrase_string_containment": 0.08, + "score": 0.08, + "score_name": "fixed_group_absval_norm_hedges_g_paraphrase_string_containment", + "score_ci_low": 0.05, + "score_ci_high": 0.1, + "fixed_group_absval_norm_hedges_g_paraphrase_string_containment_ci_low": 0.05, + "fixed_group_absval_norm_hedges_g_paraphrase_string_containment_ci_high": 0.1, +} + + +outputs = test_metric( + metric=metric, + predictions=predictions, + references=references, + instance_targets=instance_targets_string_containment, + global_target=global_target, + additional_inputs=additional_inputs, +) + +add_to_catalog( + metric, + "metrics.robustness.fixed_group_absval_norm_hedges_g_paraphrase_string_containment", + overwrite=True, +) + # TokenOverlap: example of a metric that has more than one score global_target = { diff --git a/tests/library/test_metrics.py b/tests/library/test_metrics.py index 0d0413376d..baaa9faea2 100644 --- a/tests/library/test_metrics.py +++ b/tests/library/test_metrics.py @@ -1,8 +1,5 @@ -from copy import deepcopy from math import isnan -import numpy as np - from src.unitxt.logging_utils import get_logger from src.unitxt.metrics import ( Accuracy, @@ -11,6 +8,10 @@ F1Micro, F1MicroMultiLabel, F1Weighted, + FixedGroupAbsvalNormCohensHParaphraseAccuracy, + FixedGroupAbsvalNormCohensHParaphraseStringContainment, + FixedGroupAbsvalNormHedgesGParaphraseAccuracy, + FixedGroupAbsvalNormHedgesGParaphraseStringContainment, FixedGroupMeanAccuracy, FixedGroupMeanBaselineAccuracy, FixedGroupMeanBaselineStringContainment, @@ -72,33 +73,26 @@ [" abcdefg", "AB", "abcd"], ] -# possibly multi-column group identifier -GROUPED_INSTANCE_ADDL_INPUTS = ( - [deepcopy({"group": "grp1", "id": 0, "ignore": 1}) for _ in range(5)] - + [deepcopy({"group": "grp1", "id": 1, "ignore": 1}) for _ in range(5)] - + [deepcopy({"group": "grp2", "id": 0, "ignore": 1}) for _ in range(4)] - + [deepcopy({"group": "grp2", "id": 1, "ignore": 0}) for _ in range(1)] -) - -# for group_mean aggregations with a subgroup_comparison, add a variant_type label -# these groupings correspond in length to the group identifiers above -VARIANT_TYPE = np.concatenate( - [ - np.repeat(a=["original", "paraphrase"], repeats=reps) - for reps in [[1, 4], [1, 4], [1, 3], [1, 0]] - ] -).tolist() - -# construct grouping_field by combining two other fields (and ignoring one); mimics what you would do in cards -group_by_fields = ["group", "id"] - -for ai, vt in zip(GROUPED_INSTANCE_ADDL_INPUTS, VARIANT_TYPE): - ai.update( - { - "group_id": "_".join([str(ai[ff]) for ff in group_by_fields]), - "variant_type": vt, - } - ) +# additional inputs, consisting of a group_id (group instance scores by this, then apply aggregation function) +# and variant_type (for metrics that compare, say original vs paraphrase instance score) +# create 4 groups, of sizes 5,5,4,1 +GROUPED_INSTANCE_ADDL_INPUTS = [ + {"group_id": "group1", "variant_type": "original"}, + {"group_id": "group1", "variant_type": "paraphrase"}, + {"group_id": "group1", "variant_type": "paraphrase"}, + {"group_id": "group1", "variant_type": "paraphrase"}, + {"group_id": "group1", "variant_type": "paraphrase"}, + {"group_id": "group2", "variant_type": "original"}, + {"group_id": "group2", "variant_type": "paraphrase"}, + {"group_id": "group2", "variant_type": "paraphrase"}, + {"group_id": "group2", "variant_type": "paraphrase"}, + {"group_id": "group2", "variant_type": "paraphrase"}, + {"group_id": "group3", "variant_type": "original"}, + {"group_id": "group3", "variant_type": 
"paraphrase"}, + {"group_id": "group3", "variant_type": "paraphrase"}, + {"group_id": "group3", "variant_type": "paraphrase"}, + {"group_id": "group4", "variant_type": "original"}, +] class TestMetrics(UnitxtTestCase): @@ -488,6 +482,10 @@ def test_grouped_instance_metrics(self): FixedGroupPDRParaphraseStringContainment(), FixedGroupNormHedgesGParaphraseAccuracy(), FixedGroupNormHedgesGParaphraseStringContainment(), + FixedGroupAbsvalNormCohensHParaphraseAccuracy(), + FixedGroupAbsvalNormCohensHParaphraseStringContainment(), + FixedGroupAbsvalNormHedgesGParaphraseAccuracy(), + FixedGroupAbsvalNormHedgesGParaphraseStringContainment(), ] global_targets = [ 0.225, @@ -505,6 +503,10 @@ def test_grouped_instance_metrics(self): 0.4444444444444445, -0.34565986391520215, -0.08060156608173413, + 0.6471689271009087, + 0.4639421840102023, + 0.3832160660602437, + 0.08060156608173413, ] for metric, target in zip(accuracy_metrics, global_targets): outputs = apply_metric( @@ -759,6 +761,31 @@ def test_grouped_instance_metric_confidence_interval(self): expected_ci_high=-0.046656947481584346, ) + # absolute value of Hedges' g and Cohen's h + self._test_grouped_instance_confidence_interval( + metric=FixedGroupAbsvalNormCohensHParaphraseAccuracy(), + expected_ci_low=0.33333333333333337, + expected_ci_high=1.0, + ) + + self._test_grouped_instance_confidence_interval( + metric=FixedGroupAbsvalNormCohensHParaphraseStringContainment(), + expected_ci_low=0.39182655203060723, + expected_ci_high=0.49999999999999994, + ) + + self._test_grouped_instance_confidence_interval( + metric=FixedGroupAbsvalNormHedgesGParaphraseAccuracy(), + expected_ci_low=0.05633430321756243, + expected_ci_high=1.0, + ) + + self._test_grouped_instance_confidence_interval( + metric=FixedGroupAbsvalNormHedgesGParaphraseStringContainment(), + expected_ci_low=0.046656947481584346, + expected_ci_high=0.09757387538180902, + ) + # pass global dict because there are additional fields other than the main score self._test_grouped_instance_confidence_interval( metric=GroupMeanTokenOverlap(), From 624953872291d3ec3952fb7fa9b350a76303603d Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Tue, 20 Feb 2024 21:20:51 +0200 Subject: [PATCH 81/83] changes to global metric confidence interval now resample non-NaN values, so CI will not be NaN --- prepare/metrics/roc_auc.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/prepare/metrics/roc_auc.py b/prepare/metrics/roc_auc.py index 7a64ba53eb..4048a203fc 100644 --- a/prepare/metrics/roc_auc.py +++ b/prepare/metrics/roc_auc.py @@ -12,11 +12,11 @@ instance_targets = [{"roc_auc": np.nan, "score": np.nan, "score_name": "roc_auc"}] * 3 global_targets = { "roc_auc": 0.5, - "roc_auc_ci_high": np.nan, - "roc_auc_ci_low": np.nan, + "roc_auc_ci_high": 0.5, + "roc_auc_ci_low": 0.9, "score": 0.5, - "score_ci_high": np.nan, - "score_ci_low": np.nan, + "score_ci_high": 0.5, + "score_ci_low": 0.9, "score_name": "roc_auc", } From 651dd844d0eaae3b0b5dfc09e3d2ad0e703a9961 Mon Sep 17 00:00:00 2001 From: Samuel Ackerman Date: Tue, 20 Feb 2024 21:46:19 +0200 Subject: [PATCH 82/83] Revert "changes to global metric confidence interval now resample non-NaN values, so CI will not be NaN" This reverts commit 624953872291d3ec3952fb7fa9b350a76303603d. 
---
 prepare/metrics/roc_auc.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/prepare/metrics/roc_auc.py b/prepare/metrics/roc_auc.py
index 7a64ba53eb..1b2f056e6f 100644
--- a/prepare/metrics/roc_auc.py
+++ b/prepare/metrics/roc_auc.py
@@ -12,11 +12,11 @@ instance_targets = [{"roc_auc": np.nan, "score": np.nan, "score_name": "roc_auc"}] * 3
 global_targets = {
     "roc_auc": 0.5,
-    "roc_auc_ci_high": np.nan,
-    "roc_auc_ci_low": np.nan,
+    "roc_auc_ci_high": 0.9,
+    "roc_auc_ci_low": 0.5,
     "score": 0.5,
-    "score_ci_high": np.nan,
-    "score_ci_low": np.nan,
+    "score_ci_high": 0.9,
+    "score_ci_low": 0.5,
     "score_name": "roc_auc",
 }
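
The commits above (PATCH 68-70) both harden and rescale the Hedges' g aggregation: a pooled-SD effect size, a small-sample correction, clipping at +/-5, and division by 5 so the group score lands in [-1, 1]. The sketch below is a minimal, illustrative restatement of that logic, assuming two plain lists of instance scores (control = "original" variants, comparison = "paraphrase" variants); the helper name and two-list signature are assumptions for this sketch only, while the in-tree function is normalized_hedges_g in src/unitxt/metrics.py, which takes a subgroup-scores dict plus control/comparison subgroup types.

from statistics import mean, variance

import numpy as np


def normalized_hedges_g_sketch(control_scores, comparison_scores, max_abs=5.0):
    # Illustrative sketch only (not the unitxt API): Hedges' g between two score lists,
    # clipped to [-max_abs, max_abs] and rescaled to [-1, 1].
    group_scores = [control_scores, comparison_scores]
    group_n = [len(scores) for scores in group_scores]
    if any(n == 0 for n in group_n) or all(n <= 1 for n in group_n):
        # need both subgroups non-empty and at least one with more than one observation
        return np.nan
    group_mean = [mean(scores) for scores in group_scores]
    # sample variance; a singleton subgroup contributes zero variance
    group_var = [0.0 if n == 1 else variance(scores) for scores, n in zip(group_scores, group_n)]
    pooled_sd = np.sqrt(sum((n - 1) * v for v, n in zip(group_var, group_n)) / (sum(group_n) - 2))
    diff = float(group_mean[1] - group_mean[0])
    if diff == 0:
        g = 0.0
    elif pooled_sd == 0:
        # zero variance but different means: saturate at the clipping bound
        g = float(np.sign(diff)) * max_abs
    else:
        g = diff / pooled_sd
    n = sum(group_n)
    if 3 < n < 50:
        # small-sample adjustment (see the NIST reference cited in the diff)
        g *= ((n - 3) / (n - 2.25)) * np.sqrt((n - 2) / n)
    g = float(np.clip(g, -max_abs, max_abs))
    return g / max_abs  # rescale to [-1, 1]


# toy usage: one group's accuracy scores, original variant vs. its paraphrases
print(normalized_hedges_g_sketch([1.0], [0.0, 1.0, 0.0, 0.0]))  # roughly -0.17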
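
PATCH 73 above changes performance_drop_rate so that a control mean of 0 yields 0 (rather than NaN) when the comparison mean is also 0. Below is a minimal illustrative sketch of that guard, again assuming plain score lists and a hypothetical helper name; the in-tree version operates on a subgroup-scores dict after validating the subgroup types.

import numpy as np


def performance_drop_rate_sketch(control_scores, comparison_scores):
    # Illustrative sketch only: relative drop of the comparison mean vs. the control mean.
    if len(control_scores) == 0 or len(comparison_scores) == 0:
        return np.nan
    control_mean = sum(control_scores) / len(control_scores)
    comparison_mean = sum(comparison_scores) / len(comparison_scores)
    if control_mean == 0:
        # both means zero: no drop; otherwise the relative drop is undefined
        return 0.0 if comparison_mean == 0 else np.nan
    return 1 - comparison_mean / control_mean


print(performance_drop_rate_sketch([1.0], [0.0, 1.0, 0.0, 0.0]))  # 0.75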