From e54cdf59deaed9b28c5be0be50deca5f986c5bf1 Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Tue, 27 Feb 2024 11:23:21 +0100 Subject: [PATCH 01/20] Added 'HyperLoss' class and phi2 loss evaluation --- n3fit/runcards/examples/Basic_hyperopt.yml | 2 +- n3fit/src/n3fit/checks.py | 29 +- .../n3fit/hyper_optimization/hyper_scan.py | 23 +- .../src/n3fit/hyper_optimization/penalties.py | 43 ++- n3fit/src/n3fit/hyper_optimization/rewards.py | 296 +++++++++++++++++- n3fit/src/n3fit/model_trainer.py | 160 +++++++--- n3fit/src/n3fit/performfit.py | 6 +- n3fit/src/n3fit/tests/test_hyperopt.py | 98 +++++- n3fit/src/n3fit/vpinterface.py | 56 ++++ 9 files changed, 634 insertions(+), 79 deletions(-) diff --git a/n3fit/runcards/examples/Basic_hyperopt.yml b/n3fit/runcards/examples/Basic_hyperopt.yml index 6b5393a870..1bff746fab 100644 --- a/n3fit/runcards/examples/Basic_hyperopt.yml +++ b/n3fit/runcards/examples/Basic_hyperopt.yml @@ -81,7 +81,7 @@ hyperscan_config: activations: ['sigmoid', 'tanh'] kfold: - target: average + fold_statistic: average penalties: - saturation - patience diff --git a/n3fit/src/n3fit/checks.py b/n3fit/src/n3fit/checks.py index 6b37f46bd3..0b5416e2d9 100644 --- a/n3fit/src/n3fit/checks.py +++ b/n3fit/src/n3fit/checks.py @@ -9,6 +9,7 @@ from n3fit.hyper_optimization import penalties as penalties_module from n3fit.hyper_optimization import rewards as rewards_module +from n3fit.hyper_optimization.rewards import HyperLoss from reportengine.checks import CheckError, make_argcheck from validphys.core import PDF from validphys.pdfbases import check_basis @@ -255,15 +256,33 @@ def check_kfold_options(kfold): raise CheckError( f"The penalty '{penalty}' is not recognized, ensure it is implemented in hyper_optimization/penalties.py" ) - loss_target = kfold.get("target") - if loss_target is not None: - if not hasattr(rewards_module, loss_target): + + loss_type = kfold.get("loss_type") + if loss_type is not None: + if loss_type not in HyperLoss().implemented_losses: + raise CheckError( + f"Loss type '{loss_type}' is not recognized, " + "ensure it is implemented in the HyperLoss class in hyper_optimization/rewards.py." + "Options so far are 'chi2' or 'phi2'." 
+ ) + replica_statistic = kfold.get("replica_statistic") + if replica_statistic is not None: + if replica_statistic not in HyperLoss().implemented_stats: raise CheckError( - f"The hyperoptimization target '{loss_target}' loss is not recognized, " - "ensure it is implemented in hyper_optimization/rewards.py" + f"The replica statistic '{replica_statistic}' is not recognized, " + "ensure it is implemented in the HyperLoss class in hyper_optimization/rewards.py" ) + fold_statistic = kfold.get("fold_statistic") + if fold_statistic is not None: + if fold_statistic not in HyperLoss().implemented_stats: + raise CheckError( + f"The fold statistic '{fold_statistic}' is not recognized, " + "ensure it is implemented in the HyperLoss class in hyper_optimization/rewards.py" + ) + partitions = kfold["partitions"] # Check specific errors for specific targets + loss_target = kfold.get("fold_statistic") # TODO: haven't updated this if loss_target == "fit_future_tests": if len(partitions) == 1: raise CheckError("Cannot use target 'fit_future_tests' with just one partition") diff --git a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py index f06234ff6a..561a23283e 100644 --- a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py +++ b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py @@ -14,6 +14,7 @@ """ import copy import logging +from typing import Callable import hyperopt from hyperopt.pyll.base import scope @@ -24,6 +25,11 @@ log = logging.getLogger(__name__) +# Hyperopt uses these strings for a passed and failed run +# it also has statusses "new", "running" and "suspended", but we don't use them +HYPEROPT_STATUSSES = {True: "ok", False: "fail"} + + HYPEROPT_SEED = 42 @@ -118,7 +124,7 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals= parameters of the best trial as found by ``hyperopt`` """ # Tell the trainer we are doing hpyeropt - model_trainer.set_hyperopt(True, keys=hyperscanner.hyper_keys, status_ok=hyperopt.STATUS_OK) + model_trainer.set_hyperopt(True, keys=hyperscanner.hyper_keys) # Generate the trials object trials = FileTrials(replica_path_set, parameters=hyperscanner.as_dict()) # Initialize seed for hyperopt @@ -132,7 +138,7 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals= # Perform the scan best = hyperopt.fmin( - fn=model_trainer.hyperparametrizable, + fn=_status_wrapper(model_trainer.hyperparametrizable), space=hyperscanner.as_dict(), algo=hyperopt.tpe.suggest, max_evals=max_evals, @@ -144,6 +150,19 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals= return hyperscanner.space_eval(best) +def _status_wrapper(hyperparametrizable: Callable) -> Callable: + """ + Wrapper that just converts the "status" value to hyperopt's conventions. 
+ """ + + def wrapped(*args, **kwargs): + results_dict = hyperparametrizable(*args, **kwargs) + results_dict["status"] = HYPEROPT_STATUSSES[results_dict["status"]] + return results_dict + + return wrapped + + class ActivationStr: """ Upon call this class returns an array where the activation function diff --git a/n3fit/src/n3fit/hyper_optimization/penalties.py b/n3fit/src/n3fit/hyper_optimization/penalties.py index 77a156cd3a..4afb79b4f9 100644 --- a/n3fit/src/n3fit/hyper_optimization/penalties.py +++ b/n3fit/src/n3fit/hyper_optimization/penalties.py @@ -40,6 +40,11 @@ def saturation(pdf_model=None, n=100, min_x=1e-6, max_x=1e-4, flavors=None, **_k flavors: list(int) indices of the flavors to inspect + Returns + ------- + NDArray + array of saturation penalties for each replica + Example ------- >>> from n3fit.hyper_optimization.penalties import saturation @@ -72,8 +77,9 @@ def saturation(pdf_model=None, n=100, min_x=1e-6, max_x=1e-4, flavors=None, **_k return extra_loss -def patience(stopping_object=None, alpha=1e-4, **_kwargs): - """Adds a penalty for fits that have finished too soon, which +def patience(stopping_object, alpha: float = 1e-4, **_kwargs): + """ + Adds a penalty for fits that have finished too soon, which means the number of epochs or its patience is not optimal. The penalty is proportional to the validation loss and will be 0 when the best epoch is exactly at max_epoch - patience @@ -85,6 +91,11 @@ def patience(stopping_object=None, alpha=1e-4, **_kwargs): alpha: float dumping factor for the exponent + Returns + ------- + NDArray + patience penalty for each replica + Example ------- >>> from n3fit.hyper_optimization.penalties import patience @@ -94,11 +105,11 @@ def patience(stopping_object=None, alpha=1e-4, **_kwargs): 3.434143467595683 """ - epoch_best = np.take(stopping_object.e_best_chi2, 0) + epoch_best = np.array(stopping_object.e_best_chi2) patience = stopping_object.stopping_patience max_epochs = stopping_object.total_epochs diff = abs(max_epochs - patience - epoch_best) - vl_loss = np.take(stopping_object.vl_chi2, 0) + vl_loss = np.array(stopping_object.vl_chi2) return vl_loss * np.exp(alpha * diff) @@ -109,6 +120,11 @@ def integrability(pdf_model=None, **_kwargs): The penalty increases exponentially with the growth of the integrability number + Returns + ------- + NDArray + array of integrability penalties for each replica + Example ------- >>> from n3fit.hyper_optimization.penalties import integrability @@ -121,8 +137,19 @@ def integrability(pdf_model=None, **_kwargs): """ pdf_instance = N3PDF(pdf_model.split_replicas()) integ_values = integrability_numbers(pdf_instance) - integ_overflow = np.sum(integ_values[integ_values > fitveto.INTEG_THRESHOLD]) - if integ_overflow > 50.0: - # before reaching an overflow, just give a stupidly big number - return np.exp(50.0) + + # set components under the threshold to 0 + integ_values[integ_values <= fitveto.INTEG_THRESHOLD] = 0.0 + + # sum over flavours + integ_overflow = np.sum(integ_values, axis=-1) # -1 rather than 1 so it works with 1 replica + + # Limit components to 50 to avoid overflow + if isinstance(integ_overflow, np.ndarray): + # Case: multi-replica scenario + integ_overflow[integ_overflow > 50.0] = 50.0 + elif isinstance(integ_overflow, (float, np.float64)): + # Case: single replica scenario + integ_overflow = min(integ_overflow, 50.0) + return np.exp(integ_overflow) - 1.0 diff --git a/n3fit/src/n3fit/hyper_optimization/rewards.py b/n3fit/src/n3fit/hyper_optimization/rewards.py index 0587a47649..af97aa5bf7 
100644 --- a/n3fit/src/n3fit/hyper_optimization/rewards.py +++ b/n3fit/src/n3fit/hyper_optimization/rewards.py @@ -24,30 +24,300 @@ std: 0.8925 """ +import logging +from typing import Callable, Dict, List + import numpy as np -from n3fit.vpinterface import N3PDF +from n3fit.backends import MetaModel +from n3fit.vpinterface import N3PDF, compute_phi2 +from validphys.core import DataGroupSpec from validphys.pdfgrids import distance_grids, xplotting_grid +log = logging.getLogger(__name__) + def _pdfs_to_n3pdfs(pdfs_per_fold): """Convert a list of multi-replica PDFs to a list of N3PDFs""" return [N3PDF(pdf.split_replicas(), name=f"fold_{k}") for k, pdf in enumerate(pdfs_per_fold)] -def average(fold_losses=None, **_kwargs): - """Returns the average of fold losses""" - return np.average(fold_losses) - - -def best_worst(fold_losses=None, **_kwargs): - """Returns the maximum loss of all k folds""" - return np.max(fold_losses) - +class HyperLoss: + """ + Class to compute the hyper_loss based on the individual replica losses. + + Computes the statistic over the replicas and then over the folds, both + statistics default to the average. + + Parameters + ---------- + loss_type: str + the type of loss over the replicas to use. + Options are "chi2" and "phi2". + replica_statistic: str + the statistic over the replicas to use, for per replica losses. + Options are "average", "best_worst", and "std". + fold_statistic: str + the statistic over the folds to use. + Options are "average", "best_worst", and "std". + """ -def std(fold_losses=None, **_kwargs): - """Return the standard dev of the losses of the folds""" - return np.std(fold_losses) + def __init__( + self, loss_type: str = None, replica_statistic: str = None, fold_statistic: str = None + ): + self.implemented_stats = { + "average": self._average, + "best_worst": self._best_worst, + "std": self._std, + } + self.implemented_losses = ["chi2", "phi2"] + + self._default_statistic = "average" + self._default_loss = "chi2" + + self.loss_type = self._parse_loss(loss_type) + self.reduce_over_replicas = self._parse_statistic(replica_statistic, "replica_statistic") + self.reduce_over_folds = self._parse_statistic(fold_statistic, "fold_statistic") + + self.phi2_vector = [] + self.chi2_matrix = [] + + self.penalties = {} + + def compute_loss( + self, + penalties: Dict[str, np.ndarray], + experimental_loss: np.ndarray, + pdf_model: MetaModel, + experimental_data: List[DataGroupSpec], + fold_idx: int = 0, + ) -> float: + """ + Compute the loss, including added penalties, for a single fold. + + Parameters + ---------- + penalties: Dict[str, NDArray(replicas)] + Dict of penalties for each replica. + experimental_loss: NDArray(replicas) + Experimental loss for each replica. + pdf_model: :class:`n3fit.backends.MetaModel` + N3fitted meta-model. + experimental_data: List[validphys.core.DataGroupSpec] + List of tuples containing `validphys.core.DataGroupSpec` instances for each group data set + fold_idx: int + k-fold index. Defaults to 0. + + Returns + ------- + loss: float + The computed loss over the replicas. 
+ + Example + ------- + >>> import numpy as np + >>> from n3fit.hyper_optimization.rewards import HyperLoss + >>> from n3fit.model_gen import generate_pdf_model + >>> from validphys.loader import Loader + >>> hyper = HyperLoss(loss_type="chi2", replica_statistic="average", fold_statistic="average") + >>> penalties = {'saturation': np.array([1.0, 5.0])} + >>> experimental_loss = np.array([0.1, 0.2]) + >>> ds = Loader().check_dataset("NMC", theoryid=399, cuts="internal") + >>> experimental_data = [Loader().check_experiment("My DataGroupSpec", [ds])] + >>> fake_fl = [{'fl' : i, 'largex' : [0,1], 'smallx': [1,2]} for i in ['u', 'ubar', 'd', 'dbar', 'c', 'g', 's', 'sbar']] + >>> pdf_model = generate_pdf_model(nodes=[8], activations=['linear'], seed=0, num_replicas=2, flav_info=fake_fl, fitbasis="FLAVOUR") + >>> loss = hyper.compute_loss(penalties, experimental_loss, pdf_model, experimental_data) + """ + # calculate phi2 for a given k-fold using vpinterface and validphys + phi2_per_fold = compute_phi2(N3PDF(pdf_model.split_replicas()), experimental_data) + + # update hyperopt metrics + # these are saved in the phi2_vector and chi2_matrix attributes, excluding penalties + self._save_hyperopt_metrics(phi2_per_fold, experimental_loss, penalties, fold_idx) + + # include penalties to experimental loss + # this allows introduction of statistics also to penalties + experimental_loss_w_penalties = experimental_loss + sum(penalties.values()) + + # add penalties to phi2 in the form of a sum of per-replicas averages + phi2_per_fold += sum(np.mean(penalty) for penalty in penalties.values()) + + # define loss for hyperopt according to the chosen loss_type + if self.loss_type == "chi2": + # calculate statistics of chi2 over replicas for a given k-fold + loss = self.reduce_over_replicas(experimental_loss_w_penalties) + elif self.loss_type == "phi2": + loss = phi2_per_fold + + return loss + + def _save_hyperopt_metrics( + self, + phi2_per_fold: float, + chi2_per_fold: np.ndarray, + penalties: Dict[str, np.ndarray], + fold_idx: int = 0, + ) -> None: + """ + Save all chi2 and phi2 calculated metrics per replica and per fold, including penalties. + + Parameters + ---------- + phi2_per_fold: float + Computed phi2 for a given k-fold + chi2_per_fold: np.ndarray + Computed chi2 for each replica for a given k-fold + penalties: Dict[str, np.ndarray] + dictionary of all penalties with their names + fold_idx: int + k-fold index. Defaults to 0. + """ + # reset chi2 and phi2 arrays for every trial + if fold_idx == 0: + self.phi2_vector = [] + self.chi2_matrix = [] + self.penalties = {} + + # populate chi2 matrix and phi2 vector calculated for a given k-fold + self.chi2_matrix.append(chi2_per_fold) + self.phi2_vector.append(phi2_per_fold) + + # save penalties per replica for a given k-fold + for name, values in penalties.items(): + temp = self.penalties.get(name, []) + temp.append(values) + self.penalties[name] = temp + + def _parse_loss(self, loss_type: str) -> str: + """ + Parse the type of loss and return the default if None. + + Parameters + ---------- + loss_type: str + The loss type to parse. + + Returns + ------- + loss_type: str + The parsed loss type. + + Raises + ------ + ValueError: If an invalid loss type is provided. 
+ """ + if loss_type is None: + loss_type = self._default_loss + log.warning(f"No loss_type selected in HyperLoss, defaulting to {loss_type}") + else: + if loss_type not in self.implemented_losses: + valid_options = ", ".join(self.implemented_losses) + raise ValueError( + f"Invalid loss type '{loss_type}'. Valid options are: {valid_options}" + ) + + log.info(f"Setting '{loss_type}' as the loss type for hyperoptimization") + + return loss_type + + def _parse_statistic(self, statistic: str, name: str) -> Callable: + """ + Parse the statistic and return the default if None. + + Parameters + ---------- + statistic: str + The statistic to parse. + name: str + The name of the statistic. + + Returns + ------- + Callable: The parsed statistic method. + + Raises + ------ + ValueError: If an invalid statistic is provided. + + Notes + ----- + For loss type equal to phi2, the applied fold statistics is always the reciprocal of the selected stats. + """ + if statistic is None: + statistic = self._default_statistic + log.warning(f"No {name} selected in HyperLoss, defaulting to {statistic}") + else: + if statistic not in self.implemented_stats: + valid_options = ", ".join(self.implemented_stats.keys()) + raise ValueError( + f"Invalid {name} '{statistic}'. Valid options are: {valid_options}" + ) + + log.info(f"Using '{statistic}' as the {name} for hyperoptimization") + + selected_statistic = self.implemented_stats[statistic] + + if self.loss_type == "chi2": + return selected_statistic + + elif self.loss_type == "phi2": + # In case of phi2, calculate the inverse of the applied statistics + # This is only used when calculating statistics over folds + return lambda x: np.reciprocal(selected_statistic(x)) + + @staticmethod + def _average(fold_losses: np.ndarray, axis: int = 0) -> np.ndarray: + """ + Compute the average of the input array along the specified axis. + + Parameters + ---------- + fold_losses: np.ndarray + Input array. + axis: int, optional + Axis along which the mean is computed. Default is 0. + + Returns + ------- + np.ndarray: The average along the specified axis. + """ + return np.average(fold_losses, axis=axis).item() + + @staticmethod + def _best_worst(fold_losses: np.ndarray, axis: int = 0) -> np.ndarray: + """ + Compute the maximum value of the input array along the specified axis. + + Parameters + ---------- + fold_losses: np.ndarray + Input array. + axis: int, optional + Axis along which the maximum is computed. Default is 0. + + Returns + ------- + np.ndarray: The maximum value along the specified axis. + """ + return np.max(fold_losses, axis=axis).item() + + @staticmethod + def _std(fold_losses: np.ndarray, axis: int = 0) -> np.ndarray: + """ + Compute the standard deviation of the input array along the specified axis. + + Parameters + ---------- + fold_losses: np.ndarray + Input array. + axis: int, optional + Axis along which the standard deviation is computed. Default is 0. + + Returns + ------- + np.ndarray: The standard deviation along the specified axis. 
+ """ + return np.std(fold_losses, axis=axis).item() def fit_distance(pdfs_per_fold=None, **_kwargs): diff --git a/n3fit/src/n3fit/model_trainer.py b/n3fit/src/n3fit/model_trainer.py index 59acff377a..8db9acfaec 100644 --- a/n3fit/src/n3fit/model_trainer.py +++ b/n3fit/src/n3fit/model_trainer.py @@ -19,8 +19,11 @@ from n3fit.backends import operations as op import n3fit.hyper_optimization.penalties import n3fit.hyper_optimization.rewards +from n3fit.hyper_optimization.rewards import HyperLoss from n3fit.scaler import generate_scaler from n3fit.stopping import Stopping +from n3fit.vpinterface import N3PDF, compute_phi2 +from validphys.core import DataGroupSpec from validphys.photon.compute import Photon log = logging.getLogger(__name__) @@ -88,14 +91,13 @@ class ModelTrainer: def __init__( self, + experiments_data, exp_info, pos_info, integ_info, flavinfo, fitbasis, nnseeds, - pass_status="ok", - failed_status="fail", debug=False, kfold_parameters=None, max_cores=None, @@ -103,11 +105,13 @@ def __init__( sum_rules=None, theoryid=None, lux_params=None, - replica_idxs=None, + replicas=None, ): """ Parameters ---------- + experiments_data: list + list of `validphys.core.DataGroupSpec` containing experiments exp_info: list list of dictionaries containing experiments pos_info: list @@ -120,10 +124,6 @@ def __init__( the name of the basis being fitted nnseeds: list(int) the seed used to initialise the NN for each model to be passed to model_gen - pass_status: str - flag to signal a good run - failed_status: str - flag to signal a bad run debug: bool flag to activate some debug options kfold_parameters: dict @@ -149,14 +149,13 @@ def __init__( self.flavinfo = flavinfo self.fitbasis = fitbasis self._nn_seeds = nnseeds - self.pass_status = pass_status - self.failed_status = failed_status self.debug = debug self.all_datasets = [] self._scaler = None self.theoryid = theoryid self.lux_params = lux_params - self.replica_idxs = replica_idxs + self.replicas = replicas + self.experiments_data = experiments_data # Initialise internal variables which define behaviour if debug: @@ -182,12 +181,14 @@ def __init__( self.hyper_penalties.append(pen_fun) log.info("Adding penalty: %s", penalty) # Check what is the hyperoptimization target function - hyper_loss = kfold_parameters.get("target", None) - if hyper_loss is None: - hyper_loss = "average" - log.warning("No minimization target selected, defaulting to '%s'", hyper_loss) - log.info("Using '%s' as the target for hyperoptimization", hyper_loss) - self._hyper_loss = getattr(n3fit.hyper_optimization.rewards, hyper_loss) + replica_statistic = kfold_parameters.get("replica_statistic", None) + fold_statistic = kfold_parameters.get("fold_statistic", None) + loss_type = kfold_parameters.get("loss_type", None) + self._hyper_loss = HyperLoss( + loss_type=loss_type, + replica_statistic=replica_statistic, + fold_statistic=fold_statistic, + ) # Initialize the dictionaries which contain all fitting information self.input_list = [] @@ -229,9 +230,8 @@ def __init__( if debug: self.callbacks.append(callbacks.TimerCallback()) - def set_hyperopt(self, hyperopt_on, keys=None, status_ok="ok"): + def set_hyperopt(self, hyperopt_on, keys=None): """Set hyperopt options on and off (mostly suppresses some printing)""" - self.pass_status = status_ok if keys is None: keys = [] self._hyperkeys = keys @@ -684,7 +684,7 @@ def _generate_pdf( regularizer_args=regularizer_args, impose_sumrule=self.impose_sumrule, scaler=self._scaler, - num_replicas=len(self.replica_idxs), + 
num_replicas=len(self.replicas), photons=photons, ) return pdf_model @@ -710,7 +710,7 @@ def _prepare_reporting(self, partition): reporting_list.append(reporting_dict) return reporting_list - def _train_and_fit(self, training_model, stopping_object, epochs=100): + def _train_and_fit(self, training_model, stopping_object, epochs=100) -> bool: """ Trains the NN for the number of epochs given using stopping_object as the stopping criteria @@ -740,9 +740,8 @@ def _train_and_fit(self, training_model, stopping_object, epochs=100): # TODO: in order to use multireplica in hyperopt is is necessary to define what "passing" means # for now consider the run as good if any replica passed - if any(bool(i) for i in stopping_object.e_best_chi2): - return self.pass_status - return self.failed_status + fit_has_passed = any(bool(i) for i in stopping_object.e_best_chi2) + return fit_has_passed def _hyperopt_override(self, params): """Unrolls complicated hyperopt structures into very simple dictionaries""" @@ -798,6 +797,43 @@ def evaluate(self, stopping_object): exp_chi2 = self.experimental["model"].compute_losses()["loss"] / self.experimental["ndata"] return train_chi2, val_chi2, exp_chi2 + def _filter_datagroupspec(self, datasets_partition): + """Takes a list of all input exp datasets as :class:`validphys.core.DataGroupSpec` + and select `DataSetSpec`s whose names are in datasets_partition. + + Parameters + ---------- + datasets_partition: List[str] + List with names of the datasets you want to select. + + Returns + ------- + filtered_datagroupspec: List[validphys.core.DataGroupSpec] + List of filtered exp datasets whose names are in datasets_partition. + """ + filtered_datagroupspec = [] + + # self.experiments_data is composed of a list of `DataGroupSpec` objects + # These represent a group of related exp data sets + # Loop over this list + for datagroup in self.experiments_data: + filtered_datasetspec = [] + + # Each `DataGroupSpec` is composed by several `DataSetSpec` objects + # `DataSetSpec` represents each exp dataset + # Now, loop over them + for dataset in datagroup.datasets: + # Include `DataSetSpec`s whose names are in datasets_partition + if dataset.name in datasets_partition: + filtered_datasetspec.append(dataset) + + # List of filtered experiments as `DataGroupSpec` + filtered_datagroupspec.append( + DataGroupSpec(name=f"{datagroup.name}_exp", datasets=filtered_datasetspec) + ) + + return filtered_datagroupspec + def hyperparametrizable(self, params): """ Wrapper around all the functions defining the fit. 
@@ -853,6 +889,8 @@ def hyperparametrizable(self, params): # And lists to save hyperopt utilities pdfs_per_fold = [] exp_models = [] + # phi2 evaluated over training/validation exp data + trvl_phi2_per_fold = [] # Generate the grid in x, note this is the same for all partitions xinput = self._xgrid_generation() @@ -860,7 +898,7 @@ def hyperparametrizable(self, params): # Initialize all photon classes for the different replicas: if self.lux_params: photons = Photon( - theoryid=self.theoryid, lux_params=self.lux_params, replicas=self.replica_idxs + theoryid=self.theoryid, lux_params=self.lux_params, replicas=self.replicas ) else: photons = None @@ -937,33 +975,61 @@ def hyperparametrizable(self, params): passed = self._train_and_fit(models["training"], stopping_object, epochs=epochs) if self.mode_hyperopt: - # If doing a hyperparameter scan we need to keep track of the loss function - # Since hyperopt needs _one_ number take the average in case of many replicas - validation_loss = np.mean(stopping_object.vl_chi2) + if not passed: + log.info("Hyperparameter combination fail to find a good fit, breaking") + break + + validation_loss = stopping_object.vl_chi2 - # Compute experimental loss - exp_loss_raw = np.average(models["experimental"].compute_losses()["loss"]) - # And divide by the number of active points in this fold + # number of active points in this fold # it would be nice to have a ndata_per_fold variable coming in the vp object... ndata = np.sum([np.count_nonzero(i[k]) for i in self.experimental["folds"]]) # If ndata == 0 then it's the opposite, all data is in! if ndata == 0: ndata = self.experimental["ndata"] + + # Compute experimental loss, over excluded datasets + exp_loss_raw = models["experimental"].compute_losses()["loss"] experimental_loss = exp_loss_raw / ndata - hyper_loss = experimental_loss - if passed != self.pass_status: - log.info("Hyperparameter combination fail to find a good fit, breaking") - # If the fit failed to fit, no need to add a penalty to the loss - break - for penalty in self.hyper_penalties: - hyper_loss += penalty(pdf_model=pdf_model, stopping_object=stopping_object) + # Compute penalties per replica + penalties = { + penalty.__name__: penalty(pdf_model=pdf_model, stopping_object=stopping_object) + for penalty in self.hyper_penalties + } + + # Extracting the necessary data to compute phi2 + # First, create a list of `validphys.core.DataGroupSpec` + # containing only exp datasets within the held out fold + experimental_data = self._filter_datagroupspec(partition["datasets"]) + + # Compute per replica hyper losses + hyper_loss = self._hyper_loss.compute_loss( + penalties=penalties, + experimental_loss=experimental_loss, + pdf_model=pdf_model, + experimental_data=experimental_data, + fold_idx=k, + ) + log.info("Fold %d finished, loss=%.1f, pass=%s", k + 1, hyper_loss, passed) + # Create another list of `validphys.core.DataGroupSpec` + # containing now exp datasets that are included in the training/validation dataset + trvl_partitions = list(self.kpartitions) + trvl_partitions.pop(k) + trvl_exp_names = [ + exp_name for item in trvl_partitions for exp_name in item['datasets'] + ] + trvl_data = self._filter_datagroupspec(trvl_exp_names) + # evaluate phi2 on training/validation exp set + trvl_phi2 = compute_phi2(N3PDF(pdf_model.split_replicas()), trvl_data) + # Now save all information from this fold l_hyper.append(hyper_loss) l_valid.append(validation_loss) l_exper.append(experimental_loss) + trvl_phi2_per_fold.append(trvl_phi2) pdfs_per_fold.append(pdf_model) 
exp_models.append(models["experimental"]) @@ -981,20 +1047,32 @@ def hyperparametrizable(self, params): # endfor if self.mode_hyperopt: + # turn losses into arrays + l_hyper = np.array(l_hyper) + l_valid = np.array(l_valid) + l_exper = np.array(l_exper) + + # Compute the loss over all folds for hyperopt + final_hyper_loss = self._hyper_loss.reduce_over_folds(l_hyper) + # Hyperopt needs a dictionary with information about the losses # it is possible to store arbitrary information in the trial file # by adding it to this dictionary dict_out = { "status": passed, - "loss": self._hyper_loss( - fold_losses=l_hyper, pdfs_per_fold=pdfs_per_fold, experimental_models=exp_models - ), + "loss": final_hyper_loss, "validation_loss": np.average(l_valid), "experimental_loss": np.average(l_exper), "kfold_meta": { "validation_losses": l_valid, + "validation_losses_phi2": np.array(trvl_phi2_per_fold), "experimental_losses": l_exper, - "hyper_losses": l_hyper, + "hyper_losses": np.array(self._hyper_loss.chi2_matrix), + "hyper_losses_phi2": np.array(self._hyper_loss.phi2_vector), + "penalties": { + name: np.array(values) + for name, values in self._hyper_loss.penalties.items() + }, }, } return dict_out diff --git a/n3fit/src/n3fit/performfit.py b/n3fit/src/n3fit/performfit.py index 6514029ca2..9447ad9279 100644 --- a/n3fit/src/n3fit/performfit.py +++ b/n3fit/src/n3fit/performfit.py @@ -21,6 +21,7 @@ @n3fit.checks.check_fiatlux_pdfs_id def performfit( *, + experiments_data, n3fit_checks_action, # wrapper for all checks replicas, # checks specific to performfit replicas_nnseed_fitting_data_dict, @@ -74,6 +75,8 @@ def performfit( data: validphys.core.DataGroupSpec containing the datasets to be included in the fit. (Only used for checks) + experiments_data: list[validphys.core.DataGroupSpec] + similar to `data` but now passed as argument to `ModelTrainer` replicas_nnseed_fitting_data_dict: list[tuple] list with element for each replica (typically just one) to be fitted. Each element @@ -181,6 +184,7 @@ def performfit( # Generate a ModelTrainer object # this object holds all necessary information to train a PDF (up to the NN definition) the_model_trainer = ModelTrainer( + experiments_data, exp_info, posdatasets_fitting_pos_dict, integdatasets_fitting_integ_dict, @@ -194,7 +198,7 @@ def performfit( sum_rules=sum_rules, theoryid=theoryid, lux_params=fiatlux, - replica_idxs=replica_idxs, + replicas=replica_idxs, ) # This is just to give a descriptive name to the fit function diff --git a/n3fit/src/n3fit/tests/test_hyperopt.py b/n3fit/src/n3fit/tests/test_hyperopt.py index f53377b083..88569ab7eb 100644 --- a/n3fit/src/n3fit/tests/test_hyperopt.py +++ b/n3fit/src/n3fit/tests/test_hyperopt.py @@ -6,17 +6,99 @@ import shutil import subprocess as sp +import numpy as np from numpy.testing import assert_approx_equal +import pytest + +from n3fit.hyper_optimization.rewards import HyperLoss +from n3fit.model_gen import generate_pdf_model +from validphys.loader import Loader + + +def generate_pdf(seed, num_replicas): + """Generate generic pdf model.""" + fake_fl = [ + {"fl": i, "largex": [0, 1], "smallx": [1, 2]} + for i in ["u", "ubar", "d", "dbar", "c", "g", "s", "sbar"] + ] + pdf_model = generate_pdf_model( + nodes=[8], + activations=["linear"], + seed=seed, + num_replicas=num_replicas, + flav_info=fake_fl, + fitbasis="FLAVOUR", + ) + return pdf_model + + +def get_experimental_data(dataset_name="NMC", theoryid=399): + """Get experimental data set using validphys. 
+ + Returns a list defined by the data set as + `validphys.core.DataGroupSpec`. + """ + loader = Loader() + ds = loader.check_dataset(dataset_name, theoryid=theoryid, cuts="internal") + return loader.check_experiment("My DataGroupSpec", [ds]) + + +@pytest.mark.parametrize( + "loss_type, replica_statistic, expected_per_fold_loss", + [ + ("chi2", "average", 0.15), + ("chi2", "best_worst", 0.2), + ("chi2", "std", 0.05), + ("phi2", None, None), + ], +) +def test_compute_per_fold_loss(loss_type, replica_statistic, expected_per_fold_loss): + """Check that the losses per fold are calculated correctly. + + This example assumes a 2 replica calculation with 3 added penalties. + """ + # generate 2 replica pdf model + pdf_model = generate_pdf(seed=0, num_replicas=2) + # add 3 penalties for a 2 replica model + penalties = { + 'saturation': np.array([0.0, 0.0]), + 'patience': np.array([0.0, 0.0]), + 'integrability': np.array([0.0, 0.0]), + } + # experimental losses for each replica + experimental_loss = np.array([0.1, 0.2]) + # get experimental data to compare with + experimental_data = [get_experimental_data()] + + loss = HyperLoss(loss_type=loss_type, replica_statistic=replica_statistic) + + # calculate statistic loss for one specific fold + predicted_per_fold_loss = loss.compute_loss( + penalties, experimental_loss, pdf_model, experimental_data + ) + + # Assert + if expected_per_fold_loss is not None: + assert_approx_equal(predicted_per_fold_loss, expected_per_fold_loss) + else: + assert predicted_per_fold_loss > 0 # Test for non-negativity + assert predicted_per_fold_loss.dtype == np.float64 # Test its type + # Add more property-based tests specific to "phi2" if possible + + +def test_loss_reduce_over_folds(): + """Ensure that the hyper loss statistics over all folds are calculated correctly.""" + # define losses for 3 folds + losses = np.array([1.0, 2.0, 3.0]) -from n3fit.hyper_optimization import rewards + loss_average = HyperLoss(fold_statistic="average") + assert_approx_equal(loss_average.reduce_over_folds(losses), 2.0) + loss_best_worst_best_worst = HyperLoss(fold_statistic="best_worst") + assert_approx_equal(loss_best_worst_best_worst.reduce_over_folds(losses), 3.0) -def test_rewards(): - """Ensure that rewards continue doing what they are supposed to do""" - losses = [0.0, 1.0, 2.0] - assert_approx_equal(rewards.average(losses), 1.0) - assert_approx_equal(rewards.best_worst(losses), 2.0) - assert_approx_equal(rewards.std(losses), 0.816496580927726) + loss_std = HyperLoss(fold_statistic="std") + assert_approx_equal(loss_std.reduce_over_folds(losses), 0.816496580927726) REGRESSION_FOLDER = pathlib.Path(__file__).with_name("regressions") @@ -78,7 +160,7 @@ def test_restart_from_pickle(tmp_path): direct_json_path = f"{output_direct}/nnfit/replica_{REPLICA}/tries.json" direct_json = load_data(direct_json_path) - # minimum check: the generated list of nested dictionaries have same lenght + # minimum check: the generated list of nested dictionaries have same length assert len(restart_json) == len(direct_json) for i in range(n_trials_total): diff --git a/n3fit/src/n3fit/vpinterface.py b/n3fit/src/n3fit/vpinterface.py index 12e5aca374..453ef5738c 100644 --- a/n3fit/src/n3fit/vpinterface.py +++ b/n3fit/src/n3fit/vpinterface.py @@ -27,8 +27,10 @@ from n3fit.backends import PREPROCESSING_LAYER_ALL_REPLICAS from validphys.arclength import arc_lengths, integrability_number from validphys.core import PDF, MCStats +from validphys.covmats import covmat_from_systematics, sqrt_covmat from validphys.lhapdfset 
import LHAPDFSet from validphys.pdfbases import ALL_FLAVOURS, check_basis +from validphys.results import abs_chi2_data, phi_data, results log = logging.getLogger(__name__) # Order of the evolution basis output from n3fit @@ -331,3 +333,57 @@ def compute_arclength(self, q0=1.65, basis="evolution", flavours=None): flavours = ["sigma", "gluon", "V", "V3", "V8"] ret = arc_lengths(self, [q0], basis, flavours) return ret.stats.central_value() + + +def compute_phi2(n3pdf, experimental_data): + """Compute phi2 using validphys functions. + + For more info on how phi is calculated; see Eq.(4.6) of 10.1007/JHEP04(2015)040 + + Parameters + ---------- + n3pdfs: :class:`n3fit.vpinterface.N3PDF` + `N3PDF` instance defining the n3fitted multi-replica PDF + experimental_data: List[validphys.core.DataGroupSpec] + List of experiment group datasets as `DataGroupSpec` instances + + Returns + ------- + sum_phi2: float + Sum of phi2 over all experimental group datasets + + Example + ------- + >>> from n3fit.vpinterface import N3PDF, compute_phi2 + >>> from n3fit.model_gen import generate_pdf_model + >>> from validphys.loader import Loader + >>> fake_fl = [{'fl' : i, 'largex' : [0,1], 'smallx': [1,2]} for i in ['u', 'ubar', 'd', 'dbar', 'c', 'g', 's', 'sbar']] + >>> pdf_model = generate_pdf_model(nodes=[8], activations=['linear'], seed=0, num_replicas=2, flav_info=fake_fl, fitbasis="FLAVOUR") + >>> n3pdf = N3PDF(pdf_model.split_replicas()) + >>> ds = Loader().check_dataset("NMC", theoryid=399, cuts="internal") + >>> data_group_spec = Loader().check_experiment("My DataGroupSpec", [ds]) + >>> phi2 = compute_phi2(n3pdf, [data_group_spec]) + """ + sum_phi2 = 0.0 + # Loop over the list of `DataGroupSpec` objects + for datagroupspec in experimental_data: + # datagroupspec is an instance of `DataGroupSpec` + + # Loop over `DataGroupSpec` datasets + for datasetspec in datagroupspec.datasets: + # datasetspec is an instance of `DataSetSpec` + + # get covariant matrix for each `DataSetSpec` + covmat = covmat_from_systematics(datasetspec.load_commondata(), datasetspec) + + # get experiment (`DataResult`) and theory (`ThPredictionsResult`) predictions + res = results(datasetspec, n3pdf, covmat, sqrt_covmat(covmat)) + + # calculate standard chi2 (all_chi2) and chi2 using PDF central values (central_chi2) + chi2 = abs_chi2_data(res) + + # calculate phi and store phi**2 + phi, _ = phi_data(chi2) + sum_phi2 += phi**2 + + return sum_phi2 From 23ac231166b3fc9fc08d66840aa021a0a70546e5 Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Tue, 5 Mar 2024 07:55:29 +0100 Subject: [PATCH 02/20] Update 'rewards.py' module docstring --- n3fit/src/n3fit/hyper_optimization/rewards.py | 40 +++++++++++-------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/n3fit/src/n3fit/hyper_optimization/rewards.py b/n3fit/src/n3fit/hyper_optimization/rewards.py index af97aa5bf7..864542fda0 100644 --- a/n3fit/src/n3fit/hyper_optimization/rewards.py +++ b/n3fit/src/n3fit/hyper_optimization/rewards.py @@ -1,28 +1,34 @@ """ Target functions to minimize during hyperparameter scan - Not all functions will use all arguments. - Keyword arguments that model_trainer.py will pass to this file are: + These are implemented in the HyperLoss class which incorporates + various statistics (average, standard deviation, best/worst case) + both across multiple replicas of a model and across different folds. 
- - fold_losses: a list with the loss of each fold - - pdfs_per_fold: a list of (multi replica) PDFs for each fold - - experimental_models: a reference to the model that contains the cv for all data (no masks) + Key functionalities include: + - Support for different loss types such as Chi-square (chi2) and phi-square (phi2). + - Calculation of statistical measures (average, best_worst, std) over replicas and folds. + - Incorporation of penalties into the loss computation. + - Detailed tracking and storage of loss metrics for further analysis. - New loss functions can be added directly in this module - the name in the runcard must match the name in the module + New statistics can be added directly in this class as staticmethods and + via :attr:`~HyperLoss.implemented_stats`; their name in the runcard must + match the name in the module Example ------- - >>> import n3fit.hyper_optimization.rewards - >>> f = ["average", "best_worst", "std"] - >>> losses = [2.34, 1.234, 3.42] - >>> for fname in f: - >>> fun = getattr(n3fit.hyper_optimization.rewards, fname) - >>> print(f"{fname}: {fun(losses, None):2.4f}") - average: 2.3313 - best_worst: 3.4200 - std: 0.8925 - + >>> import numpy as np + >>> from n3fit.hyper_optimization.rewards import HyperLoss + >>> losses = np.array([1.0, 2.0, 3.0]) + >>> loss_average = HyperLoss(fold_statistic="average") + >>> loss_best_worst = HyperLoss(fold_statistic="best_worst") + >>> loss_std = HyperLoss(fold_statistic="std") + >>> print(f"{loss_average.reduce_over_folds.__name__} {loss_average.reduce_over_folds(losses)}") + >>> print(f"{loss_best_worst.reduce_over_folds.__name__} {loss_best_worst.reduce_over_folds(losses)}") + >>> print(f"{loss_std.reduce_over_folds.__name__} {loss_std.reduce_over_folds(losses)}") + _average 2.0 + _best_worst 3.0 + _std 0.816496580927726 """ import logging from typing import Callable, Dict, List From 91466ec38279d0d2b2a6940ca800f0f6caa77ad9 Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Tue, 5 Mar 2024 08:04:18 +0100 Subject: [PATCH 03/20] Fix in staticmethods output type hints --- n3fit/src/n3fit/hyper_optimization/rewards.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/n3fit/src/n3fit/hyper_optimization/rewards.py b/n3fit/src/n3fit/hyper_optimization/rewards.py index 864542fda0..f803033e02 100644 --- a/n3fit/src/n3fit/hyper_optimization/rewards.py +++ b/n3fit/src/n3fit/hyper_optimization/rewards.py @@ -272,7 +272,7 @@ def _parse_statistic(self, statistic: str, name: str) -> Callable: return lambda x: np.reciprocal(selected_statistic(x)) @staticmethod - def _average(fold_losses: np.ndarray, axis: int = 0) -> np.ndarray: + def _average(fold_losses: np.ndarray, axis: int = 0) -> float: """ Compute the average of the input array along the specified axis. @@ -285,12 +285,12 @@ def _average(fold_losses: np.ndarray, axis: int = 0) -> np.ndarray: Returns ------- - np.ndarray: The average along the specified axis. + float: The average along the specified axis. """ return np.average(fold_losses, axis=axis).item() @staticmethod - def _best_worst(fold_losses: np.ndarray, axis: int = 0) -> np.ndarray: + def _best_worst(fold_losses: np.ndarray, axis: int = 0) -> float: """ Compute the maximum value of the input array along the specified axis. @@ -303,12 +303,12 @@ def _best_worst(fold_losses: np.ndarray, axis: int = 0) -> np.ndarray: Returns ------- - np.ndarray: The maximum value along the specified axis. + float: The maximum value along the specified axis. 
""" return np.max(fold_losses, axis=axis).item() @staticmethod - def _std(fold_losses: np.ndarray, axis: int = 0) -> np.ndarray: + def _std(fold_losses: np.ndarray, axis: int = 0) -> float: """ Compute the standard deviation of the input array along the specified axis. @@ -321,7 +321,7 @@ def _std(fold_losses: np.ndarray, axis: int = 0) -> np.ndarray: Returns ------- - np.ndarray: The standard deviation along the specified axis. + float: The standard deviation along the specified axis. """ return np.std(fold_losses, axis=axis).item() From 3eeb531f759ffd1c70c69fc4dd1d16d763fd679f Mon Sep 17 00:00:00 2001 From: Carlos Murilo Romero Rocha <114645116+Cmurilochem@users.noreply.github.com> Date: Tue, 5 Mar 2024 08:07:35 +0100 Subject: [PATCH 04/20] Update comment in 'vpinterface.py' Co-authored-by: Roy Stegeman --- n3fit/src/n3fit/vpinterface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/n3fit/src/n3fit/vpinterface.py b/n3fit/src/n3fit/vpinterface.py index 453ef5738c..6222fa834e 100644 --- a/n3fit/src/n3fit/vpinterface.py +++ b/n3fit/src/n3fit/vpinterface.py @@ -376,7 +376,7 @@ def compute_phi2(n3pdf, experimental_data): # get covariant matrix for each `DataSetSpec` covmat = covmat_from_systematics(datasetspec.load_commondata(), datasetspec) - # get experiment (`DataResult`) and theory (`ThPredictionsResult`) predictions + # get experiment info (`DataResult`) and theory predictions (`ThPredictionsResult`) res = results(datasetspec, n3pdf, covmat, sqrt_covmat(covmat)) # calculate standard chi2 (all_chi2) and chi2 using PDF central values (central_chi2) From 6ac3ef11b28b4ccd5935780e867fc865dadd14a5 Mon Sep 17 00:00:00 2001 From: Carlos Murilo Romero Rocha <114645116+Cmurilochem@users.noreply.github.com> Date: Tue, 5 Mar 2024 08:22:46 +0100 Subject: [PATCH 05/20] Update comment in 'hyper_scan.py' Co-authored-by: Roy Stegeman --- n3fit/src/n3fit/hyper_optimization/hyper_scan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py index 561a23283e..123ac59b3b 100644 --- a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py +++ b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py @@ -26,7 +26,7 @@ log = logging.getLogger(__name__) # Hyperopt uses these strings for a passed and failed run -# it also has statusses "new", "running" and "suspended", but we don't use them +# it also has statuses "new", "running" and "suspended", but we don't use them HYPEROPT_STATUSSES = {True: "ok", False: "fail"} From 339ceff1e0ec1da92f91319bd80106a8fedfa396 Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Tue, 5 Mar 2024 08:17:09 +0100 Subject: [PATCH 06/20] Update compute_phi2 return value as suggested by Roy --- n3fit/src/n3fit/vpinterface.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/n3fit/src/n3fit/vpinterface.py b/n3fit/src/n3fit/vpinterface.py index 6222fa834e..981974cabf 100644 --- a/n3fit/src/n3fit/vpinterface.py +++ b/n3fit/src/n3fit/vpinterface.py @@ -364,7 +364,8 @@ def compute_phi2(n3pdf, experimental_data): >>> data_group_spec = Loader().check_experiment("My DataGroupSpec", [ds]) >>> phi2 = compute_phi2(n3pdf, [data_group_spec]) """ - sum_phi2 = 0.0 + sum_phi = 0.0 + ndat_tot = 0 # Loop over the list of `DataGroupSpec` objects for datagroupspec in experimental_data: # datagroupspec is an instance of `DataGroupSpec` @@ -383,7 +384,8 @@ def compute_phi2(n3pdf, experimental_data): chi2 = abs_chi2_data(res) # calculate phi and store phi**2 - phi, _ 
= phi_data(chi2) - sum_phi2 += phi**2 + phi, ndat = phi_data(chi2) + sum_phi += np.sqrt(ndat) * phi + ndat_tot += ndat - return sum_phi2 + return sum_phi / np.sqrt(ndat_tot) From 682e8da337063b44026448c533f98b07957c2b32 Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Tue, 5 Mar 2024 08:41:13 +0100 Subject: [PATCH 07/20] Update 'penalties' argument info and example in 'compute_loss' --- n3fit/src/n3fit/hyper_optimization/rewards.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/n3fit/src/n3fit/hyper_optimization/rewards.py b/n3fit/src/n3fit/hyper_optimization/rewards.py index f803033e02..3f19dca9b7 100644 --- a/n3fit/src/n3fit/hyper_optimization/rewards.py +++ b/n3fit/src/n3fit/hyper_optimization/rewards.py @@ -105,6 +105,8 @@ def compute_loss( ---------- penalties: Dict[str, NDArray(replicas)] Dict of penalties for each replica. + Possible keys are 'saturation', 'patience' and 'integrability' + as defined in 'penalties.py' and instantiated within :class:`~n3fit.model_trainer.ModelTrainer`. experimental_loss: NDArray(replicas) Experimental loss for each replica. pdf_model: :class:`n3fit.backends.MetaModel` @@ -126,7 +128,7 @@ def compute_loss( >>> from n3fit.model_gen import generate_pdf_model >>> from validphys.loader import Loader >>> hyper = HyperLoss(loss_type="chi2", replica_statistic="average", fold_statistic="average") - >>> penalties = {'saturation': np.array([1.0, 5.0])} + >>> penalties = {'saturation': np.array([1.0, 2.0]), 'patience': np.array([3.0, 4.0]), 'integrability': np.array([5.0, 6.0]),} >>> experimental_loss = np.array([0.1, 0.2]) >>> ds = Loader().check_dataset("NMC", theoryid=399, cuts="internal") >>> experimental_data = [Loader().check_experiment("My DataGroupSpec", [ds])] From 16cc5bf81e2febf8d59fdd97a8d7618af76c1f14 Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Tue, 5 Mar 2024 17:01:27 +0100 Subject: [PATCH 08/20] Changed 'compute_phi2' to 'compute_phi' --- n3fit/src/n3fit/hyper_optimization/rewards.py | 4 ++-- n3fit/src/n3fit/model_trainer.py | 4 ++-- n3fit/src/n3fit/vpinterface.py | 12 ++++++------ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/n3fit/src/n3fit/hyper_optimization/rewards.py b/n3fit/src/n3fit/hyper_optimization/rewards.py index 3f19dca9b7..fcaf2846b9 100644 --- a/n3fit/src/n3fit/hyper_optimization/rewards.py +++ b/n3fit/src/n3fit/hyper_optimization/rewards.py @@ -36,7 +36,7 @@ import numpy as np from n3fit.backends import MetaModel -from n3fit.vpinterface import N3PDF, compute_phi2 +from n3fit.vpinterface import N3PDF, compute_phi from validphys.core import DataGroupSpec from validphys.pdfgrids import distance_grids, xplotting_grid @@ -137,7 +137,7 @@ def compute_loss( >>> loss = hyper.compute_loss(penalties, experimental_loss, pdf_model, experimental_data) """ # calculate phi2 for a given k-fold using vpinterface and validphys - phi2_per_fold = compute_phi2(N3PDF(pdf_model.split_replicas()), experimental_data) + phi2_per_fold = compute_phi(N3PDF(pdf_model.split_replicas()), experimental_data) # update hyperopt metrics # these are saved in the phi2_vector and chi2_matrix attributes, excluding penalties diff --git a/n3fit/src/n3fit/model_trainer.py b/n3fit/src/n3fit/model_trainer.py index 8db9acfaec..1b4554f410 100644 --- a/n3fit/src/n3fit/model_trainer.py +++ b/n3fit/src/n3fit/model_trainer.py @@ -22,7 +22,7 @@ from n3fit.hyper_optimization.rewards import HyperLoss from n3fit.scaler import generate_scaler from n3fit.stopping import Stopping -from n3fit.vpinterface import N3PDF, compute_phi2 +from 
n3fit.vpinterface import N3PDF, compute_phi from validphys.core import DataGroupSpec from validphys.photon.compute import Photon @@ -1023,7 +1023,7 @@ def hyperparametrizable(self, params): ] trvl_data = self._filter_datagroupspec(trvl_exp_names) # evaluate phi2 on training/validation exp set - trvl_phi2 = compute_phi2(N3PDF(pdf_model.split_replicas()), trvl_data) + trvl_phi2 = compute_phi(N3PDF(pdf_model.split_replicas()), trvl_data) # Now save all information from this fold l_hyper.append(hyper_loss) diff --git a/n3fit/src/n3fit/vpinterface.py b/n3fit/src/n3fit/vpinterface.py index 981974cabf..c564941f7d 100644 --- a/n3fit/src/n3fit/vpinterface.py +++ b/n3fit/src/n3fit/vpinterface.py @@ -335,8 +335,8 @@ def compute_arclength(self, q0=1.65, basis="evolution", flavours=None): return ret.stats.central_value() -def compute_phi2(n3pdf, experimental_data): - """Compute phi2 using validphys functions. +def compute_phi(n3pdf, experimental_data): + """Compute phi using validphys functions. For more info on how phi is calculated; see Eq.(4.6) of 10.1007/JHEP04(2015)040 @@ -349,12 +349,12 @@ def compute_phi2(n3pdf, experimental_data): Returns ------- - sum_phi2: float - Sum of phi2 over all experimental group datasets + sum_phi: float + Sum of phi over all experimental group datasets Example ------- - >>> from n3fit.vpinterface import N3PDF, compute_phi2 + >>> from n3fit.vpinterface import N3PDF, compute_phi >>> from n3fit.model_gen import generate_pdf_model >>> from validphys.loader import Loader >>> fake_fl = [{'fl' : i, 'largex' : [0,1], 'smallx': [1,2]} for i in ['u', 'ubar', 'd', 'dbar', 'c', 'g', 's', 'sbar']] @@ -362,7 +362,7 @@ def compute_phi2(n3pdf, experimental_data): >>> n3pdf = N3PDF(pdf_model.split_replicas()) >>> ds = Loader().check_dataset("NMC", theoryid=399, cuts="internal") >>> data_group_spec = Loader().check_experiment("My DataGroupSpec", [ds]) - >>> phi2 = compute_phi2(n3pdf, [data_group_spec]) + >>> phi = compute_phi(n3pdf, [data_group_spec]) """ sum_phi = 0.0 ndat_tot = 0 From cd95bf50951ca3fbd687a1e4fded965fc85f7301 Mon Sep 17 00:00:00 2001 From: Aron Date: Wed, 6 Mar 2024 10:29:18 +0100 Subject: [PATCH 09/20] Get rid of _status_wrapper --- .../src/n3fit/hyper_optimization/hyper_scan.py | 17 ++--------------- n3fit/src/n3fit/model_trainer.py | 3 ++- 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py index 123ac59b3b..bccb67bd5f 100644 --- a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py +++ b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py @@ -27,7 +27,7 @@ # Hyperopt uses these strings for a passed and failed run # it also has statuses "new", "running" and "suspended", but we don't use them -HYPEROPT_STATUSSES = {True: "ok", False: "fail"} +HYPEROPT_STATUSES = {True: "ok", False: "fail"} HYPEROPT_SEED = 42 @@ -138,7 +138,7 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals= # Perform the scan best = hyperopt.fmin( - fn=_status_wrapper(model_trainer.hyperparametrizable), + fn=model_trainer.hyperparametrizable, space=hyperscanner.as_dict(), algo=hyperopt.tpe.suggest, max_evals=max_evals, @@ -150,19 +150,6 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals= return hyperscanner.space_eval(best) -def _status_wrapper(hyperparametrizable: Callable) -> Callable: - """ - Wrapper that just converts the "status" value to hyperopt's conventions. 
- """ - - def wrapped(*args, **kwargs): - results_dict = hyperparametrizable(*args, **kwargs) - results_dict["status"] = HYPEROPT_STATUSSES[results_dict["status"]] - return results_dict - - return wrapped - - class ActivationStr: """ Upon call this class returns an array where the activation function diff --git a/n3fit/src/n3fit/model_trainer.py b/n3fit/src/n3fit/model_trainer.py index 1b4554f410..78691ebf13 100644 --- a/n3fit/src/n3fit/model_trainer.py +++ b/n3fit/src/n3fit/model_trainer.py @@ -17,6 +17,7 @@ from n3fit import model_gen from n3fit.backends import NN_LAYER_ALL_REPLICAS, MetaModel, callbacks, clear_backend_state from n3fit.backends import operations as op +from n3fit.hyper_optimization.hyper_scan import HYPEROPT_STATUSES import n3fit.hyper_optimization.penalties import n3fit.hyper_optimization.rewards from n3fit.hyper_optimization.rewards import HyperLoss @@ -1059,7 +1060,7 @@ def hyperparametrizable(self, params): # it is possible to store arbitrary information in the trial file # by adding it to this dictionary dict_out = { - "status": passed, + "status": HYPEROPT_STATUSES[passed], "loss": final_hyper_loss, "validation_loss": np.average(l_valid), "experimental_loss": np.average(l_exper), From 7a06180b37b2d756c64263dbb53e5487f9427ecd Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Wed, 6 Mar 2024 13:25:19 +0100 Subject: [PATCH 10/20] Remove phi2 from names --- n3fit/src/n3fit/hyper_optimization/rewards.py | 32 +++++++++---------- n3fit/src/n3fit/model_trainer.py | 16 +++++----- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/n3fit/src/n3fit/hyper_optimization/rewards.py b/n3fit/src/n3fit/hyper_optimization/rewards.py index fcaf2846b9..dfb38bc44f 100644 --- a/n3fit/src/n3fit/hyper_optimization/rewards.py +++ b/n3fit/src/n3fit/hyper_optimization/rewards.py @@ -85,7 +85,7 @@ def __init__( self.reduce_over_replicas = self._parse_statistic(replica_statistic, "replica_statistic") self.reduce_over_folds = self._parse_statistic(fold_statistic, "fold_statistic") - self.phi2_vector = [] + self.phi_vector = [] self.chi2_matrix = [] self.penalties = {} @@ -136,43 +136,43 @@ def compute_loss( >>> pdf_model = generate_pdf_model(nodes=[8], activations=['linear'], seed=0, num_replicas=2, flav_info=fake_fl, fitbasis="FLAVOUR") >>> loss = hyper.compute_loss(penalties, experimental_loss, pdf_model, experimental_data) """ - # calculate phi2 for a given k-fold using vpinterface and validphys - phi2_per_fold = compute_phi(N3PDF(pdf_model.split_replicas()), experimental_data) + # calculate phi for a given k-fold using vpinterface and validphys + phi_per_fold = compute_phi(N3PDF(pdf_model.split_replicas()), experimental_data) # update hyperopt metrics - # these are saved in the phi2_vector and chi2_matrix attributes, excluding penalties - self._save_hyperopt_metrics(phi2_per_fold, experimental_loss, penalties, fold_idx) + # these are saved in the phi_vector and chi2_matrix attributes, excluding penalties + self._save_hyperopt_metrics(phi_per_fold, experimental_loss, penalties, fold_idx) # include penalties to experimental loss # this allows introduction of statistics also to penalties experimental_loss_w_penalties = experimental_loss + sum(penalties.values()) - # add penalties to phi2 in the form of a sum of per-replicas averages - phi2_per_fold += sum(np.mean(penalty) for penalty in penalties.values()) + # add penalties to phi in the form of a sum of per-replicas averages + phi_per_fold += sum(np.mean(penalty) for penalty in penalties.values()) # define loss for hyperopt 
according to the chosen loss_type if self.loss_type == "chi2": # calculate statistics of chi2 over replicas for a given k-fold loss = self.reduce_over_replicas(experimental_loss_w_penalties) elif self.loss_type == "phi2": - loss = phi2_per_fold + loss = phi_per_fold**2 return loss def _save_hyperopt_metrics( self, - phi2_per_fold: float, + phi_per_fold: float, chi2_per_fold: np.ndarray, penalties: Dict[str, np.ndarray], fold_idx: int = 0, ) -> None: """ - Save all chi2 and phi2 calculated metrics per replica and per fold, including penalties. + Save all chi2 and phi calculated metrics per replica and per fold, including penalties. Parameters ---------- - phi2_per_fold: float - Computed phi2 for a given k-fold + phi_per_fold: float + Computed phi for a given k-fold chi2_per_fold: np.ndarray Computed chi2 for each replica for a given k-fold penalties: Dict[str, np.ndarray] @@ -180,15 +180,15 @@ def _save_hyperopt_metrics( fold_idx: int k-fold index. Defaults to 0. """ - # reset chi2 and phi2 arrays for every trial + # reset chi2 and phi arrays for every trial if fold_idx == 0: - self.phi2_vector = [] + self.phi_vector = [] self.chi2_matrix = [] self.penalties = {} - # populate chi2 matrix and phi2 vector calculated for a given k-fold + # populate chi2 matrix and phi vector calculated for a given k-fold self.chi2_matrix.append(chi2_per_fold) - self.phi2_vector.append(phi2_per_fold) + self.phi_vector.append(phi_per_fold) # save penalties per replica for a given k-fold for name, values in penalties.items(): diff --git a/n3fit/src/n3fit/model_trainer.py b/n3fit/src/n3fit/model_trainer.py index 78691ebf13..3dd9c14779 100644 --- a/n3fit/src/n3fit/model_trainer.py +++ b/n3fit/src/n3fit/model_trainer.py @@ -890,8 +890,8 @@ def hyperparametrizable(self, params): # And lists to save hyperopt utilities pdfs_per_fold = [] exp_models = [] - # phi2 evaluated over training/validation exp data - trvl_phi2_per_fold = [] + # phi evaluated over training/validation exp data + trvl_phi_per_fold = [] # Generate the grid in x, note this is the same for all partitions xinput = self._xgrid_generation() @@ -999,7 +999,7 @@ def hyperparametrizable(self, params): for penalty in self.hyper_penalties } - # Extracting the necessary data to compute phi2 + # Extracting the necessary data to compute phi # First, create a list of `validphys.core.DataGroupSpec` # containing only exp datasets within the held out fold experimental_data = self._filter_datagroupspec(partition["datasets"]) @@ -1023,14 +1023,14 @@ def hyperparametrizable(self, params): exp_name for item in trvl_partitions for exp_name in item['datasets'] ] trvl_data = self._filter_datagroupspec(trvl_exp_names) - # evaluate phi2 on training/validation exp set - trvl_phi2 = compute_phi(N3PDF(pdf_model.split_replicas()), trvl_data) + # evaluate phi on training/validation exp set + trvl_phi = compute_phi(N3PDF(pdf_model.split_replicas()), trvl_data) # Now save all information from this fold l_hyper.append(hyper_loss) l_valid.append(validation_loss) l_exper.append(experimental_loss) - trvl_phi2_per_fold.append(trvl_phi2) + trvl_phi_per_fold.append(trvl_phi) pdfs_per_fold.append(pdf_model) exp_models.append(models["experimental"]) @@ -1066,10 +1066,10 @@ def hyperparametrizable(self, params): "experimental_loss": np.average(l_exper), "kfold_meta": { "validation_losses": l_valid, - "validation_losses_phi2": np.array(trvl_phi2_per_fold), + "validation_losses_phi": np.array(trvl_phi_per_fold), "experimental_losses": l_exper, "hyper_losses": 
np.array(self._hyper_loss.chi2_matrix), - "hyper_losses_phi2": np.array(self._hyper_loss.phi2_vector), + "hyper_losses_phi": np.array(self._hyper_loss.phi_vector), "penalties": { name: np.array(values) for name, values in self._hyper_loss.penalties.items() From ce31372fcb767bd9457551e34127b43f203bccc3 Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Wed, 6 Mar 2024 16:04:42 +0100 Subject: [PATCH 11/20] Update dataset name in test --- n3fit/src/n3fit/tests/test_hyperopt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/n3fit/src/n3fit/tests/test_hyperopt.py b/n3fit/src/n3fit/tests/test_hyperopt.py index 88569ab7eb..6e79dfe7bf 100644 --- a/n3fit/src/n3fit/tests/test_hyperopt.py +++ b/n3fit/src/n3fit/tests/test_hyperopt.py @@ -32,14 +32,14 @@ def generate_pdf(seed, num_replicas): return pdf_model -def get_experimental_data(dataset_name="NMC", theoryid=399): +def get_experimental_data(dataset_name="NMC_NC_NOTFIXED_P_EM-SIGMARED", theoryid=399): """Get experimental data set using validphys. Returns a list defined by the data set as `validphys.core.DataGroupSpec`. """ loader = Loader() - ds = loader.check_dataset(dataset_name, theoryid=theoryid, cuts="internal") + ds = loader.check_dataset(dataset_name, theoryid=theoryid, cuts="internal", variant="legacy") return loader.check_experiment("My DataGroupSpec", [ds]) From fda8e3c3b340cc2d7f1c0ed403bb3d1426c7e0f7 Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Wed, 6 Mar 2024 16:11:22 +0100 Subject: [PATCH 12/20] Change 'validation_losses_phi' dictionary key to 'trvl_losses_phi' --- n3fit/src/n3fit/model_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/n3fit/src/n3fit/model_trainer.py b/n3fit/src/n3fit/model_trainer.py index 3dd9c14779..00562f25a3 100644 --- a/n3fit/src/n3fit/model_trainer.py +++ b/n3fit/src/n3fit/model_trainer.py @@ -1066,7 +1066,7 @@ def hyperparametrizable(self, params): "experimental_loss": np.average(l_exper), "kfold_meta": { "validation_losses": l_valid, - "validation_losses_phi": np.array(trvl_phi_per_fold), + "trvl_losses_phi": np.array(trvl_phi_per_fold), "experimental_losses": l_exper, "hyper_losses": np.array(self._hyper_loss.chi2_matrix), "hyper_losses_phi": np.array(self._hyper_loss.phi_vector), From d53cee494e9cc02fb1ff862871a4b758dad615c9 Mon Sep 17 00:00:00 2001 From: Cmurilochem Date: Wed, 6 Mar 2024 21:50:24 +0100 Subject: [PATCH 13/20] Add docs for the different losses/statistics --- doc/sphinx/source/n3fit/hyperopt.rst | 94 ++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/doc/sphinx/source/n3fit/hyperopt.rst b/doc/sphinx/source/n3fit/hyperopt.rst index 7eb5ae2343..9e8b76a8f8 100644 --- a/doc/sphinx/source/n3fit/hyperopt.rst +++ b/doc/sphinx/source/n3fit/hyperopt.rst @@ -34,6 +34,8 @@ The desired features of this figure of merit can be summarized as: 3. Be reliable even when the number of points is not very large. +.. _hyperkfolding-label: + K-folding cross-validation -------------------------- A good compromise between all previous points is the usage of the cross-validation technique @@ -320,6 +322,10 @@ Changing the hyperoptimization target ----------------------------------- Beyond the usual :math:`\chi2`-based optimization figures above, it is possible to utilize other measures as the target for hyperoptimization. 
+
+Future tests
+~~~~~~~~~~~~
+
 One possibility is to use a :ref:`future test`-based metric for which the goal is not to get the
 minimum :math:`\chi2` but to get the same :math:`\chi2` (with PDF errors considered) for different datasets.
 The idea is that this way we select models of which the prediction is stable upon variations in the dataset.
 In order to obtain the PDF errors used in the figure of merit it is necessary to run multiple replicas,
 luckily ``n3fit`` provides such a possibility also during hyperoptimization.
@@ -372,6 +378,94 @@ The figure of merit will be the difference between the :math:`\chi2` of the seco
 
     L_{\rm hyperopt} = \chi^{2}_{(1) \rm pdferr} - \chi^{2}_{(2)}
 
+New hyperoptimization metrics with fold and replica statistics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The combination of :ref:`k-folding <hyperkfolding-label>` and multi-replica experiments
+opens several the possibilities for the choice of figure of merit. The simplest option would be to minimize
+the average of :math:`\chi2` across both replica and k folds, *i.e.*,
+
+.. math::
+    L_{1} = \frac{1}{n_{\rm fold}} \sum_{k=1}^{n_{\rm fold}} \left< \chi^2_{k} \right>_{\rm rep}.
+
+In NNPDF, this hyperoptimisation metric is selected via the following generic runcard:
+
+.. code-block:: yaml
+
+    dataset_inputs:
+      ...
+
+    kfold:
+        loss_type: chi2
+        replica_statistic: average
+        fold_statistic: average
+        partitions:
+            - datasets:
+                ...
+            - datasets:
+                ...
+
+    parallel_models: true
+    same_trvl_per_replica: true
+
+By combining the ``average``, ``best_worst``, and ``std`` figures of merit discussed in :ref:`hyperkfolding-label`,
+several alternatives may arise. For example, one approach could involve minimizing
+the maximum value of the set of averaged-over-replicas :math:`\chi2`,
+
+.. math::
+    L_{2} = {\rm max} \left ( \left< \chi^2_{1} \right>_{\rm rep}, \left< \chi^2_{2} \right>_{\rm rep}, ..., \left< \chi^2_{n_{\rm fold}} \right>_{\rm rep}\right),
+
+with the corresponding runcard ``kfold`` settings:
+
+.. code-block:: yaml
+
+    dataset_inputs:
+      ...
+
+    kfold:
+        loss_type: chi2
+        replica_statistic: average
+        fold_statistic: best_worst
+        partitions:
+            - datasets:
+                ...
+            - datasets:
+                ...
+
+An alternative metric that is
+sensitive to higher moments of the probability distribution
+has been defined in `NNPDF3.0 `_ [see Eq. (4.6) therein],
+namely, the :math:`\varphi` estimator. In the context of hyperopt, :math:`\varphi^{2}` can be calculated for each k-fold as
+
+.. math::
+    \varphi_{k}^2 = \langle \chi^2_k [ \mathcal{T}[f_{\rm fit}], \mathcal{D} ] \rangle_{\rm rep} - \chi^2_k [ \langle \mathcal{T}[f_{\rm fit}] \rangle_{\rm rep}, \mathcal{D} ],
+
+where the first term represents the usual averaged-over-replicas :math:`\left< \chi^2_{k} \right>_{\rm rep}`
+calculated based on the dataset used in the fit (:math:`\mathcal{D}`) and
+the theory predictions from each fitted PDF (:math:`f_{\rm fit}`) replica.
+The second term involves the calculation of :math:`\chi^2` but now with respect to the theory predictions from the central PDF.
+
+On the basis of :math:`\varphi`, we define the loss as
+
+.. math::
+    L_{3} = \left (\frac{1}{n_{\rm fold}} \sum_{k=1}^{n_{\rm fold}} \varphi_{k}^2 \right)^{-1},
+
+which selects hyperparameters that favor the most conservative extrapolation.
+In NNPDF, this figure of merit is chosen using the following settings:
+
+.. code-block:: yaml
+
+    kfold:
+        loss_type: phi2
+        fold_statistic: average
+        ...
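+
+As an illustration of how these statistics combine, the reductions above can be
+sketched in plain NumPy. This is only a schematic example and not the actual
+``n3fit`` implementation (which lives in the ``HyperLoss`` class); the array
+shapes and numerical values below are assumptions made for the illustration:
+
+.. code-block:: python
+
+    import numpy as np
+
+    # assumed inputs: one chi2 per (fold, replica) and one phi2 per fold
+    chi2 = np.array([[1.2, 0.9, 1.5], [1.1, 1.3, 1.0]])  # shape (n_fold, n_rep)
+    phi2 = np.array([0.4, 0.6])
+
+    # L1: average over replicas, then average over folds
+    L1 = np.average(np.average(chi2, axis=1))
+
+    # L2: average over replicas, then keep the worst (largest) fold
+    L2 = np.max(np.average(chi2, axis=1))
+
+    # L3: reciprocal of the average phi2 over folds, so that minimizing L3
+    # favours hyperparameters with large phi2, i.e. conservative extrapolation
+    L3 = np.reciprocal(np.average(phi2))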
+
+Alternatively, it is currently also possible to combine :math:`L_1` (which is only sensitive to the first moment)
+with :math:`L_3` (which provides information on the second moment).
+For example, one might minimize :math:`L_1` while monitoring the values of :math:`L_3` for a final model selection.
+The optimal approach for this combination is still under development.
+All the above options are implemented in the :class:`~n3fit.hyper_optimization.rewards.HyperLoss` class
+which is instantiated and monitored within the :meth:`~n3fit.model_trainer.ModelTrainer.hyperparametrizable` method.
+
 Restarting hyperoptimization runs
 ---------------------------------
 

From 99df0bf41d4ead71aa3108178fe82c90bd243b8e Mon Sep 17 00:00:00 2001
From: Aron Jansen 
Date: Thu, 7 Mar 2024 12:53:05 +0100
Subject: [PATCH 14/20] Update doc/sphinx/source/n3fit/hyperopt.rst

Co-authored-by: Roy Stegeman 
---
 doc/sphinx/source/n3fit/hyperopt.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/sphinx/source/n3fit/hyperopt.rst b/doc/sphinx/source/n3fit/hyperopt.rst
index 9e8b76a8f8..093847413f 100644
--- a/doc/sphinx/source/n3fit/hyperopt.rst
+++ b/doc/sphinx/source/n3fit/hyperopt.rst
@@ -381,7 +381,7 @@ New hyperoptimization metrics with fold and replica statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The combination of :ref:`k-folding <hyperkfolding-label>` and multi-replica experiments
-opens several the possibilities for the choice of figure of merit. The simplest option would be to minimize
+opens several possibilities for the choice of figure of merit. The simplest option would be to minimize

From 1267b0f1ae2b0ffb6a20c85160bff62466a3dfbd Mon Sep 17 00:00:00 2001
From: Aron Jansen 
Date: Thu, 7 Mar 2024 12:53:11 +0100
Subject: [PATCH 15/20] Update doc/sphinx/source/n3fit/hyperopt.rst

Co-authored-by: Roy Stegeman 
---
 doc/sphinx/source/n3fit/hyperopt.rst | 1 -
 1 file changed, 1 deletion(-)

diff --git a/doc/sphinx/source/n3fit/hyperopt.rst b/doc/sphinx/source/n3fit/hyperopt.rst
index 093847413f..49e1b108b8 100644
--- a/doc/sphinx/source/n3fit/hyperopt.rst
+++ b/doc/sphinx/source/n3fit/hyperopt.rst
@@ -405,7 +405,6 @@ In NNPDF, this hyperoptimisation metric is selected via the following generic r
         ...
 
     parallel_models: true
-    same_trvl_per_replica: true

From d77cbea41705f6882e8e0b2f084035426dbfc5e7 Mon Sep 17 00:00:00 2001
From: Aron Jansen 
Date: Thu, 7 Mar 2024 12:53:42 +0100
Subject: [PATCH 16/20] Update doc/sphinx/source/n3fit/hyperopt.rst

Co-authored-by: Roy Stegeman 
---
 doc/sphinx/source/n3fit/hyperopt.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/sphinx/source/n3fit/hyperopt.rst b/doc/sphinx/source/n3fit/hyperopt.rst
index 49e1b108b8..e06784f278 100644
--- a/doc/sphinx/source/n3fit/hyperopt.rst
+++ b/doc/sphinx/source/n3fit/hyperopt.rst
@@ -382,7 +382,7 @@ New hyperoptimization metrics with fold and replica statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The combination of :ref:`k-folding <hyperkfolding-label>` and multi-replica experiments
 opens several possibilities for the choice of figure of merit. The simplest option would be to minimize
-the average of :math:`\chi2` across both replica and k folds, *i.e.*,
+the average of :math:`\chi^2` across both replica and k folds, *i.e.*,
 
 .. math::
     L_{1} = \frac{1}{n_{\rm fold}} \sum_{k=1}^{n_{\rm fold}} \left< \chi^2_{k} \right>_{\rm rep}.
 

From 2050911b87ce74d72fd101a6fcd2bff59d797c56 Mon Sep 17 00:00:00 2001
From: Cmurilochem
Date: Thu, 7 Mar 2024 14:06:41 +0100
Subject: [PATCH 17/20] Add final Roy's suggestions to docs

---
 doc/sphinx/source/n3fit/hyperopt.rst | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/doc/sphinx/source/n3fit/hyperopt.rst b/doc/sphinx/source/n3fit/hyperopt.rst
index e06784f278..c8966511f6 100644
--- a/doc/sphinx/source/n3fit/hyperopt.rst
+++ b/doc/sphinx/source/n3fit/hyperopt.rst
@@ -380,6 +380,10 @@ The figure of merit will be the difference between the :math:`\chi2` of the seco
 
 New hyperoptimization metrics with fold and replica statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The hyperopt measures discussed above are all based on performing a single replica fit per fold.
+However one may also wish to run the hyperoptimization algorithm on fits consisting of many
+replicas per fold. This is a feasible option in ``n3fit``, since it has been optimised to
+efficiently run many replica fits in parallel on GPU.
 The combination of :ref:`k-folding <hyperkfolding-label>` and multi-replica experiments
 opens several possibilities for the choice of figure of merit. The simplest option would be to minimize
 the average of :math:`\chi^2` across both replica and k folds, *i.e.*,
@@ -408,6 +412,7 @@ In NNPDF, this hyperoptimisation metric is selected via the following generic r
 By combining the ``average``, ``best_worst``, and ``std`` figures of merit discussed in :ref:`hyperkfolding-label`,
 several alternatives may arise. For example, one approach could involve minimizing
-the maximum value of the set of averaged-over-replicas :math:`\chi2`,
+the maximum value of the set of averaged-over-replicas :math:`\chi^2`,

From 8a6312b56ae0d8a6313ac7d4a0a662d7665db4b8 Mon Sep 17 00:00:00 2001
From: Carlos Murilo Romero Rocha <114645116+Cmurilochem@users.noreply.github.com>
Date: Thu, 7 Mar 2024 14:29:25 +0100
Subject: [PATCH 18/20] Update doc/sphinx/source/n3fit/hyperopt.rst

Co-authored-by: Roy Stegeman 
---
 doc/sphinx/source/n3fit/hyperopt.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/sphinx/source/n3fit/hyperopt.rst b/doc/sphinx/source/n3fit/hyperopt.rst
index c8966511f6..90b922aa79 100644
--- a/doc/sphinx/source/n3fit/hyperopt.rst
+++ b/doc/sphinx/source/n3fit/hyperopt.rst
@@ -384,6 +384,7 @@ The hyperopt measures discussed above are all based on performing a single repli
 However one may also wish to run the hyperoptimization algorithm on fits consisting of many
 replicas per fold. This is a feasible option in ``n3fit``, since it has been optimised to
 efficiently run many replica fits in parallel on GPU.
+
 The combination of :ref:`k-folding <hyperkfolding-label>` and multi-replica experiments
 opens several possibilities for the choice of figure of merit. The simplest option would be to minimize
 the average of :math:`\chi^2` across both replica and k folds, *i.e.*,

From e996b95bef9e41c17cd676750d1d16530b9287a1 Mon Sep 17 00:00:00 2001
From: Carlos Murilo Romero Rocha <114645116+Cmurilochem@users.noreply.github.com>
Date: Thu, 7 Mar 2024 14:29:48 +0100
Subject: [PATCH 19/20] Update doc/sphinx/source/n3fit/hyperopt.rst

Co-authored-by: Roy Stegeman 
---
 doc/sphinx/source/n3fit/hyperopt.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/sphinx/source/n3fit/hyperopt.rst b/doc/sphinx/source/n3fit/hyperopt.rst
index 90b922aa79..83c302216d 100644
--- a/doc/sphinx/source/n3fit/hyperopt.rst
+++ b/doc/sphinx/source/n3fit/hyperopt.rst
@@ -381,7 +381,7 @@ The figure of merit will be the difference between the :math:`\chi2` of the seco
 New hyperoptimization metrics with fold and replica statistics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The hyperopt measures discussed above are all based on performing a single replica fit per fold.
-However one may also wish to run the hyperoptimization algorithm on fits consisting of many
+However, one may also wish to run the hyperoptimization algorithm on fits consisting of many
 replicas per fold. This is a feasible option in ``n3fit``, since it has been optimised to
 efficiently run many replica fits in parallel on GPU.

From 33b6d9729878ba99d0343eb09451078475ed8a25 Mon Sep 17 00:00:00 2001
From: Aron
Date: Thu, 7 Mar 2024 16:18:39 +0100
Subject: [PATCH 20/20] Avoid initialization of HyperLoss in checks

---
 n3fit/src/n3fit/checks.py                     |   8 +-
 n3fit/src/n3fit/hyper_optimization/rewards.py | 131 +++++++++---------
 2 files changed, 68 insertions(+), 71 deletions(-)

diff --git a/n3fit/src/n3fit/checks.py b/n3fit/src/n3fit/checks.py
index 0b5416e2d9..6bf0ad19c7 100644
--- a/n3fit/src/n3fit/checks.py
+++ b/n3fit/src/n3fit/checks.py
@@ -9,7 +9,7 @@
 from n3fit.hyper_optimization import penalties as penalties_module
 from n3fit.hyper_optimization import rewards as rewards_module
-from n3fit.hyper_optimization.rewards import HyperLoss
+from n3fit.hyper_optimization.rewards import IMPLEMENTED_LOSSES, IMPLEMENTED_STATS
 from reportengine.checks import CheckError, make_argcheck
 from validphys.core import PDF
 from validphys.pdfbases import check_basis
@@ -259,7 +259,7 @@ def check_kfold_options(kfold):
 
     loss_type = kfold.get("loss_type")
     if loss_type is not None:
-        if loss_type not in HyperLoss().implemented_losses:
+        if loss_type not in IMPLEMENTED_LOSSES:
             raise CheckError(
                 f"Loss type '{loss_type}' is not recognized, "
                 "ensure it is implemented in the HyperLoss class in hyper_optimization/rewards.py."
@@ -267,14 +267,14 @@ def check_kfold_options(kfold):
             )
     replica_statistic = kfold.get("replica_statistic")
     if replica_statistic is not None:
-        if replica_statistic not in HyperLoss().implemented_stats:
+        if replica_statistic not in IMPLEMENTED_STATS:
             raise CheckError(
                 f"The replica statistic '{replica_statistic}' is not recognized, "
                 "ensure it is implemented in the HyperLoss class in hyper_optimization/rewards.py"
             )
     fold_statistic = kfold.get("fold_statistic")
     if fold_statistic is not None:
-        if fold_statistic not in HyperLoss().implemented_stats:
+        if fold_statistic not in IMPLEMENTED_STATS:
             raise CheckError(
                 f"The fold statistic '{fold_statistic}' is not recognized, "
                 "ensure it is implemented in the HyperLoss class in hyper_optimization/rewards.py"
             )
diff --git a/n3fit/src/n3fit/hyper_optimization/rewards.py b/n3fit/src/n3fit/hyper_optimization/rewards.py
index dfb38bc44f..86c64eaab7 100644
--- a/n3fit/src/n3fit/hyper_optimization/rewards.py
+++ b/n3fit/src/n3fit/hyper_optimization/rewards.py
@@ -12,7 +12,7 @@
     - Detailed tracking and storage of loss metrics for further analysis.
 
-    New statistics can be added directly in this class as staticmethods and
-    via :attr:`~HyperLoss.implemented_stats`; their name in the runcard must
+    New statistics can be added directly in this module as functions and
+    via `IMPLEMENTED_STATS`; their name in the runcard must
     match the name in the module
 
 Example
@@ -43,6 +43,64 @@
 log = logging.getLogger(__name__)
 
 
+def _average(fold_losses: np.ndarray, axis: int = 0) -> float:
+    """
+    Compute the average of the input array along the specified axis.
+
+    Parameters
+    ----------
+    fold_losses: np.ndarray
+        Input array.
+    axis: int, optional
+        Axis along which the mean is computed. Default is 0.
+
+    Returns
+    -------
+    float: The average along the specified axis.
+    """
+    return np.average(fold_losses, axis=axis).item()
+
+
+def _best_worst(fold_losses: np.ndarray, axis: int = 0) -> float:
+    """
+    Compute the maximum value of the input array along the specified axis.
+
+    Parameters
+    ----------
+    fold_losses: np.ndarray
+        Input array.
+    axis: int, optional
+        Axis along which the maximum is computed. Default is 0.
+
+    Returns
+    -------
+    float: The maximum value along the specified axis.
+    """
+    return np.max(fold_losses, axis=axis).item()
+
+
+def _std(fold_losses: np.ndarray, axis: int = 0) -> float:
+    """
+    Compute the standard deviation of the input array along the specified axis.
+
+    Parameters
+    ----------
+    fold_losses: np.ndarray
+        Input array.
+    axis: int, optional
+        Axis along which the standard deviation is computed. Default is 0.
+
+    Returns
+    -------
+    float: The standard deviation along the specified axis.
+ """ + return np.std(fold_losses, axis=axis).item() + + +IMPLEMENTED_STATS = {"average": _average, "best_worst": _best_worst, "std": _std} +IMPLEMENTED_LOSSES = ["chi2", "phi2"] + + def _pdfs_to_n3pdfs(pdfs_per_fold): """Convert a list of multi-replica PDFs to a list of N3PDFs""" return [N3PDF(pdf.split_replicas(), name=f"fold_{k}") for k, pdf in enumerate(pdfs_per_fold)] @@ -71,13 +129,6 @@ class HyperLoss: def __init__( self, loss_type: str = None, replica_statistic: str = None, fold_statistic: str = None ): - self.implemented_stats = { - "average": self._average, - "best_worst": self._best_worst, - "std": self._std, - } - self.implemented_losses = ["chi2", "phi2"] - self._default_statistic = "average" self._default_loss = "chi2" @@ -218,8 +269,8 @@ def _parse_loss(self, loss_type: str) -> str: loss_type = self._default_loss log.warning(f"No loss_type selected in HyperLoss, defaulting to {loss_type}") else: - if loss_type not in self.implemented_losses: - valid_options = ", ".join(self.implemented_losses) + if loss_type not in IMPLEMENTED_LOSSES: + valid_options = ", ".join(IMPLEMENTED_LOSSES) raise ValueError( f"Invalid loss type '{loss_type}'. Valid options are: {valid_options}" ) @@ -255,15 +306,15 @@ def _parse_statistic(self, statistic: str, name: str) -> Callable: statistic = self._default_statistic log.warning(f"No {name} selected in HyperLoss, defaulting to {statistic}") else: - if statistic not in self.implemented_stats: - valid_options = ", ".join(self.implemented_stats.keys()) + if statistic not in IMPLEMENTED_STATS: + valid_options = ", ".join(IMPLEMENTED_STATS.keys()) raise ValueError( f"Invalid {name} '{statistic}'. Valid options are: {valid_options}" ) log.info(f"Using '{statistic}' as the {name} for hyperoptimization") - selected_statistic = self.implemented_stats[statistic] + selected_statistic = IMPLEMENTED_STATS[statistic] if self.loss_type == "chi2": return selected_statistic @@ -273,60 +324,6 @@ def _parse_statistic(self, statistic: str, name: str) -> Callable: # This is only used when calculating statistics over folds return lambda x: np.reciprocal(selected_statistic(x)) - @staticmethod - def _average(fold_losses: np.ndarray, axis: int = 0) -> float: - """ - Compute the average of the input array along the specified axis. - - Parameters - ---------- - fold_losses: np.ndarray - Input array. - axis: int, optional - Axis along which the mean is computed. Default is 0. - - Returns - ------- - float: The average along the specified axis. - """ - return np.average(fold_losses, axis=axis).item() - - @staticmethod - def _best_worst(fold_losses: np.ndarray, axis: int = 0) -> float: - """ - Compute the maximum value of the input array along the specified axis. - - Parameters - ---------- - fold_losses: np.ndarray - Input array. - axis: int, optional - Axis along which the maximum is computed. Default is 0. - - Returns - ------- - float: The maximum value along the specified axis. - """ - return np.max(fold_losses, axis=axis).item() - - @staticmethod - def _std(fold_losses: np.ndarray, axis: int = 0) -> float: - """ - Compute the standard deviation of the input array along the specified axis. - - Parameters - ---------- - fold_losses: np.ndarray - Input array. - axis: int, optional - Axis along which the standard deviation is computed. Default is 0. - - Returns - ------- - float: The standard deviation along the specified axis. 
- """ - return np.std(fold_losses, axis=axis).item() - def fit_distance(pdfs_per_fold=None, **_kwargs): """Loss function for hyperoptimization based on the distance of
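
# ---------------------------------------------------------------------------
# Minimal usage sketch of the module-level statistics introduced by the patch
# above. This is an illustration only, not part of the patch: the numerical
# values are invented, and it assumes the patch is applied so that
# IMPLEMENTED_STATS can be imported from n3fit.hyper_optimization.rewards.

import numpy as np

from n3fit.hyper_optimization.rewards import IMPLEMENTED_STATS

# e.g. per-replica chi2 values for a single fold
fold_losses = np.array([1.2, 0.9, 1.5, 1.1])

# the same dictionary lookup that HyperLoss._parse_statistic performs
reduce_over_replicas = IMPLEMENTED_STATS["average"]

print(reduce_over_replicas(fold_losses))             # 1.175
print(IMPLEMENTED_STATS["best_worst"](fold_losses))  # 1.5
print(IMPLEMENTED_STATS["std"](fold_losses))         # ~0.2165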