diff --git a/n3fit/runcards/examples/Basic_hyperopt.yml b/n3fit/runcards/examples/Basic_hyperopt.yml
index 6b5393a870..1bff746fab 100644
--- a/n3fit/runcards/examples/Basic_hyperopt.yml
+++ b/n3fit/runcards/examples/Basic_hyperopt.yml
@@ -81,7 +81,7 @@ hyperscan_config:
       activations: ['sigmoid', 'tanh']
 
 kfold:
-  target: average
+  fold_statistic: average
   penalties:
     - saturation
     - patience
diff --git a/n3fit/runcards/reproduce_nnpdf40/NNPDF40_nnlo_as_01180_1000.yml b/n3fit/runcards/reproduce_nnpdf40/NNPDF40_nnlo_as_01180_1000.yml
index da30fc1b2e..4e2295470f 100644
--- a/n3fit/runcards/reproduce_nnpdf40/NNPDF40_nnlo_as_01180_1000.yml
+++ b/n3fit/runcards/reproduce_nnpdf40/NNPDF40_nnlo_as_01180_1000.yml
@@ -163,4 +163,4 @@ integrability:
 
 ############################################################
 debug: false
-maxcores: 4
+maxcores: 4
\ No newline at end of file
diff --git a/n3fit/src/n3fit/checks.py b/n3fit/src/n3fit/checks.py
index e1df9e2865..9d28e8110e 100644
--- a/n3fit/src/n3fit/checks.py
+++ b/n3fit/src/n3fit/checks.py
@@ -9,6 +9,7 @@
 
 from n3fit.hyper_optimization import penalties as penalties_module
 from n3fit.hyper_optimization import rewards as rewards_module
+from n3fit.hyper_optimization.rewards import HyperLoss
 from reportengine.checks import CheckError, make_argcheck
 from validphys.core import PDF
 from validphys.pdfbases import check_basis
@@ -255,15 +256,33 @@ def check_kfold_options(kfold):
             raise CheckError(
                 f"The penalty '{penalty}' is not recognized, ensure it is implemented in hyper_optimization/penalties.py"
             )
-    loss_target = kfold.get("target")
-    if loss_target is not None:
-        if not hasattr(rewards_module, loss_target):
+
+    loss_type = kfold.get("loss_type")
+    if loss_type is not None:
+        if loss_type not in HyperLoss().implemented_losses:
+            raise CheckError(
+                f"Loss type '{loss_type}' is not recognized, "
+                "ensure it is implemented in the HyperLoss class in hyper_optimization/rewards.py."
+                "Options so far are 'chi2' or 'phi2'."
+            )
+    replica_statistic = kfold.get("replica_statistic")
+    if replica_statistic is not None:
+        if replica_statistic not in HyperLoss().implemented_stats:
             raise CheckError(
-                f"The hyperoptimization target '{loss_target}' loss is not recognized, "
-                "ensure it is implemented in hyper_optimization/rewards.py"
+                f"The replica statistic '{replica_statistic}' is not recognized, "
+                "ensure it is implemented in the HyperLoss class in hyper_optimization/rewards.py"
             )
+    fold_statistic = kfold.get("fold_statistic")
+    if fold_statistic is not None:
+        if fold_statistic not in HyperLoss().implemented_stats:
+            raise CheckError(
+                f"The fold statistic '{fold_statistic}' is not recognized, "
+                "ensure it is implemented in the HyperLoss class in hyper_optimization/rewards.py"
+            )
+
     partitions = kfold["partitions"]
     # Check specific errors for specific targets
+    loss_target = kfold.get("fold_statistic")  # TODO: haven't updated this
     if loss_target == "fit_future_tests":
         if len(partitions) == 1:
             raise CheckError("Cannot use target 'fit_future_tests' with just one partition")
diff --git a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py
index 174e921677..eec8242b73 100644
--- a/n3fit/src/n3fit/hyper_optimization/hyper_scan.py
+++ b/n3fit/src/n3fit/hyper_optimization/hyper_scan.py
@@ -14,6 +14,7 @@
 """
 import copy
 import logging
+from typing import Callable
 
 import hyperopt
 import numpy as np
@@ -23,6 +24,11 @@
 
 log = logging.getLogger(__name__)
 
+# Hyperopt uses these strings for a passed and failed run
+# it also has statusses "new", "running" and "suspended", but we don't use them
+HYPEROPT_STATUSSES = {True: "ok", False: "fail"}
+
+
 HYPEROPT_SEED = 42
 
 
@@ -113,7 +119,7 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals=
         parameters of the best trial as found by ``hyperopt``
     """
     # Tell the trainer we are doing hpyeropt
-    model_trainer.set_hyperopt(True, keys=hyperscanner.hyper_keys, status_ok=hyperopt.STATUS_OK)
+    model_trainer.set_hyperopt(True, keys=hyperscanner.hyper_keys)
     # Generate the trials object
     trials = FileTrials(replica_path_set, parameters=hyperscanner.as_dict())
     # Initialize seed for hyperopt
@@ -127,7 +133,7 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals=
 
     # Perform the scan
     best = hyperopt.fmin(
-        fn=model_trainer.hyperparametrizable,
+        fn=_status_wrapper(model_trainer.hyperparametrizable),
         space=hyperscanner.as_dict(),
         algo=hyperopt.tpe.suggest,
         max_evals=max_evals,
@@ -139,6 +145,19 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals=
     return hyperscanner.space_eval(best)
 
 
+def _status_wrapper(hyperparametrizable: Callable) -> Callable:
+    """
+    Wrapper that just converts the "status" value to hyperopt's conventions.
+    """
+
+    def wrapped(*args, **kwargs):
+        results_dict = hyperparametrizable(*args, **kwargs)
+        results_dict["status"] = HYPEROPT_STATUSSES[results_dict["status"]]
+        return results_dict
+
+    return wrapped
+
+
 class ActivationStr:
     """
     Upon call this class returns an array where the activation function
diff --git a/n3fit/src/n3fit/hyper_optimization/penalties.py b/n3fit/src/n3fit/hyper_optimization/penalties.py
index 77a156cd3a..4afb79b4f9 100644
--- a/n3fit/src/n3fit/hyper_optimization/penalties.py
+++ b/n3fit/src/n3fit/hyper_optimization/penalties.py
@@ -40,6 +40,11 @@ def saturation(pdf_model=None, n=100, min_x=1e-6, max_x=1e-4, flavors=None, **_k
         flavors: list(int)
             indices of the flavors to inspect
 
+    Returns
+    -------
+        NDArray
+            array of saturation penalties for each replica
+
     Example
     -------
     >>> from n3fit.hyper_optimization.penalties import saturation
@@ -72,8 +77,9 @@ def saturation(pdf_model=None, n=100, min_x=1e-6, max_x=1e-4, flavors=None, **_k
     return extra_loss
 
 
-def patience(stopping_object=None, alpha=1e-4, **_kwargs):
-    """Adds a penalty for fits that have finished too soon, which
+def patience(stopping_object, alpha: float = 1e-4, **_kwargs):
+    """
+    Adds a penalty for fits that have finished too soon, which
     means the number of epochs or its patience is not optimal.
     The penalty is proportional to the validation loss and will be 0
     when the best epoch is exactly at max_epoch - patience
@@ -85,6 +91,11 @@ def patience(stopping_object=None, alpha=1e-4, **_kwargs):
         alpha: float
             dumping factor for the exponent
 
+    Returns
+    -------
+        NDArray
+            patience penalty for each replica
+
     Example
     -------
     >>> from n3fit.hyper_optimization.penalties import patience
@@ -94,11 +105,11 @@ def patience(stopping_object=None, alpha=1e-4, **_kwargs):
     3.434143467595683
 
     """
-    epoch_best = np.take(stopping_object.e_best_chi2, 0)
+    epoch_best = np.array(stopping_object.e_best_chi2)
     patience = stopping_object.stopping_patience
     max_epochs = stopping_object.total_epochs
     diff = abs(max_epochs - patience - epoch_best)
-    vl_loss = np.take(stopping_object.vl_chi2, 0)
+    vl_loss = np.array(stopping_object.vl_chi2)
     return vl_loss * np.exp(alpha * diff)
 
 
@@ -109,6 +120,11 @@ def integrability(pdf_model=None, **_kwargs):
 
     The penalty increases exponentially with the growth of the integrability number
 
+    Returns
+    -------
+        NDArray
+            array of integrability penalties for each replica
+
     Example
     -------
     >>> from n3fit.hyper_optimization.penalties import integrability
@@ -121,8 +137,19 @@ def integrability(pdf_model=None, **_kwargs):
     """
     pdf_instance = N3PDF(pdf_model.split_replicas())
     integ_values = integrability_numbers(pdf_instance)
-    integ_overflow = np.sum(integ_values[integ_values > fitveto.INTEG_THRESHOLD])
-    if integ_overflow > 50.0:
-        # before reaching an overflow, just give a stupidly big number
-        return np.exp(50.0)
+
+    # set components under the threshold to 0
+    integ_values[integ_values <= fitveto.INTEG_THRESHOLD] = 0.0
+
+    # sum over flavours
+    integ_overflow = np.sum(integ_values, axis=-1)  # -1 rather than 1 so it works with 1 replica
+
+    # Limit components to 50 to avoid overflow
+    if isinstance(integ_overflow, np.ndarray):
+        # Case: multi-replica scenario
+        integ_overflow[integ_overflow > 50.0] = 50.0
+    elif isinstance(integ_overflow, (float, np.float64)):
+        # Case: single replica scenario
+        integ_overflow = min(integ_overflow, 50.0)
+
     return np.exp(integ_overflow) - 1.0
diff --git a/n3fit/src/n3fit/hyper_optimization/rewards.py b/n3fit/src/n3fit/hyper_optimization/rewards.py
index 0587a47649..af97aa5bf7 100644
--- a/n3fit/src/n3fit/hyper_optimization/rewards.py
+++ b/n3fit/src/n3fit/hyper_optimization/rewards.py
@@ -24,30 +24,300 @@
     std: 0.8925
 
 """
+import logging
+from typing import Callable, Dict, List
+
 import numpy as np
 
-from n3fit.vpinterface import N3PDF
+from n3fit.backends import MetaModel
+from n3fit.vpinterface import N3PDF, compute_phi2
+from validphys.core import DataGroupSpec
 from validphys.pdfgrids import distance_grids, xplotting_grid
 
+log = logging.getLogger(__name__)
+
 
 def _pdfs_to_n3pdfs(pdfs_per_fold):
     """Convert a list of multi-replica PDFs to a list of N3PDFs"""
     return [N3PDF(pdf.split_replicas(), name=f"fold_{k}") for k, pdf in enumerate(pdfs_per_fold)]
 
 
-def average(fold_losses=None, **_kwargs):
-    """Returns the average of fold losses"""
-    return np.average(fold_losses)
-
-
-def best_worst(fold_losses=None, **_kwargs):
-    """Returns the maximum loss of all k folds"""
-    return np.max(fold_losses)
-
+class HyperLoss:
+    """
+    Class to compute the hyper_loss based on the individual replica losses.
+
+    Computes the statistic over the replicas and then over the folds, both
+    statistics default to the average.
+
+    Parameters
+    ----------
+        loss_type: str
+            the type of loss over the replicas to use.
+            Options are "chi2" and "phi2".
+        replica_statistic: str
+            the statistic over the replicas to use, for per replica losses.
+            Options are "average", "best_worst", and "std".
+        fold_statistic: str
+            the statistic over the folds to use.
+            Options are "average", "best_worst", and "std".
+    """
 
-def std(fold_losses=None, **_kwargs):
-    """Return the standard dev of the losses of the folds"""
-    return np.std(fold_losses)
+    def __init__(
+        self, loss_type: str = None, replica_statistic: str = None, fold_statistic: str = None
+    ):
+        self.implemented_stats = {
+            "average": self._average,
+            "best_worst": self._best_worst,
+            "std": self._std,
+        }
+        self.implemented_losses = ["chi2", "phi2"]
+
+        self._default_statistic = "average"
+        self._default_loss = "chi2"
+
+        self.loss_type = self._parse_loss(loss_type)
+        self.reduce_over_replicas = self._parse_statistic(replica_statistic, "replica_statistic")
+        self.reduce_over_folds = self._parse_statistic(fold_statistic, "fold_statistic")
+
+        self.phi2_vector = []
+        self.chi2_matrix = []
+
+        self.penalties = {}
+
+    def compute_loss(
+        self,
+        penalties: Dict[str, np.ndarray],
+        experimental_loss: np.ndarray,
+        pdf_model: MetaModel,
+        experimental_data: List[DataGroupSpec],
+        fold_idx: int = 0,
+    ) -> float:
+        """
+        Compute the loss, including added penalties, for a single fold.
+
+        Parameters
+        ----------
+            penalties: Dict[str, NDArray(replicas)]
+                Dict of penalties for each replica.
+            experimental_loss: NDArray(replicas)
+                Experimental loss for each replica.
+            pdf_model: :class:`n3fit.backends.MetaModel`
+                N3fitted meta-model.
+            experimental_data: List[validphys.core.DataGroupSpec]
+                List of tuples containing `validphys.core.DataGroupSpec` instances for each group data set
+            fold_idx: int
+                k-fold index. Defaults to 0.
+
+        Returns
+        -------
+            loss: float
+                The computed loss over the replicas.
+
+        Example
+        -------
+        >>> import numpy as np
+        >>> from n3fit.hyper_optimization.rewards import HyperLoss
+        >>> from n3fit.model_gen import generate_pdf_model
+        >>> from validphys.loader import Loader
+        >>> hyper = HyperLoss(loss_type="chi2", replica_statistic="average", fold_statistic="average")
+        >>> penalties = {'saturation': np.array([1.0, 5.0])}
+        >>> experimental_loss = np.array([0.1, 0.2])
+        >>> ds = Loader().check_dataset("NMC", theoryid=399, cuts="internal")
+        >>> experimental_data = [Loader().check_experiment("My DataGroupSpec", [ds])]
+        >>> fake_fl = [{'fl' : i, 'largex' : [0,1], 'smallx': [1,2]} for i in ['u', 'ubar', 'd', 'dbar', 'c', 'g', 's', 'sbar']]
+        >>> pdf_model = generate_pdf_model(nodes=[8], activations=['linear'], seed=0, num_replicas=2, flav_info=fake_fl, fitbasis="FLAVOUR")
+        >>> loss = hyper.compute_loss(penalties, experimental_loss, pdf_model, experimental_data)
+        """
+        # calculate phi2 for a given k-fold using vpinterface and validphys
+        phi2_per_fold = compute_phi2(N3PDF(pdf_model.split_replicas()), experimental_data)
+
+        # update hyperopt metrics
+        # these are saved in the phi2_vector and chi2_matrix attributes, excluding penalties
+        self._save_hyperopt_metrics(phi2_per_fold, experimental_loss, penalties, fold_idx)
+
+        # include penalties to experimental loss
+        # this allows introduction of statistics also to penalties
+        experimental_loss_w_penalties = experimental_loss + sum(penalties.values())
+
+        # add penalties to phi2 in the form of a sum of per-replicas averages
+        phi2_per_fold += sum(np.mean(penalty) for penalty in penalties.values())
+
+        # define loss for hyperopt according to the chosen loss_type
+        if self.loss_type == "chi2":
+            # calculate statistics of chi2 over replicas for a given k-fold
+            loss = self.reduce_over_replicas(experimental_loss_w_penalties)
+        elif self.loss_type == "phi2":
+            loss = phi2_per_fold
+
+        return loss
+
+    def _save_hyperopt_metrics(
+        self,
+        phi2_per_fold: float,
+        chi2_per_fold: np.ndarray,
+        penalties: Dict[str, np.ndarray],
+        fold_idx: int = 0,
+    ) -> None:
+        """
+        Save all chi2 and phi2 calculated metrics per replica and per fold, including penalties.
+
+        Parameters
+        ----------
+            phi2_per_fold: float
+                Computed phi2 for a given k-fold
+            chi2_per_fold: np.ndarray
+                Computed chi2 for each replica for a given k-fold
+            penalties: Dict[str, np.ndarray]
+                dictionary of all penalties with their names
+            fold_idx: int
+                k-fold index. Defaults to 0.
+        """
+        # reset chi2 and phi2 arrays for every trial
+        if fold_idx == 0:
+            self.phi2_vector = []
+            self.chi2_matrix = []
+            self.penalties = {}
+
+        # populate chi2 matrix and phi2 vector calculated for a given k-fold
+        self.chi2_matrix.append(chi2_per_fold)
+        self.phi2_vector.append(phi2_per_fold)
+
+        # save penalties per replica for a given k-fold
+        for name, values in penalties.items():
+            temp = self.penalties.get(name, [])
+            temp.append(values)
+            self.penalties[name] = temp
+
+    def _parse_loss(self, loss_type: str) -> str:
+        """
+        Parse the type of loss and return the default if None.
+
+        Parameters
+        ----------
+            loss_type: str
+                The loss type to parse.
+
+        Returns
+        -------
+            loss_type: str
+                The parsed loss type.
+
+        Raises
+        ------
+            ValueError: If an invalid loss type is provided.
+        """
+        if loss_type is None:
+            loss_type = self._default_loss
+            log.warning(f"No loss_type selected in HyperLoss, defaulting to {loss_type}")
+        else:
+            if loss_type not in self.implemented_losses:
+                valid_options = ", ".join(self.implemented_losses)
+                raise ValueError(
+                    f"Invalid loss type '{loss_type}'. Valid options are: {valid_options}"
+                )
+
+        log.info(f"Setting '{loss_type}' as the loss type for hyperoptimization")
+
+        return loss_type
+
+    def _parse_statistic(self, statistic: str, name: str) -> Callable:
+        """
+        Parse the statistic and return the default if None.
+
+        Parameters
+        ----------
+            statistic: str
+                The statistic to parse.
+            name: str
+                The name of the statistic.
+
+        Returns
+        -------
+            Callable: The parsed statistic method.
+
+        Raises
+        ------
+            ValueError: If an invalid statistic is provided.
+
+        Notes
+        -----
+            For loss type equal to phi2, the applied fold statistics is always the reciprocal of the selected stats.
+        """
+        if statistic is None:
+            statistic = self._default_statistic
+            log.warning(f"No {name} selected in HyperLoss, defaulting to {statistic}")
+        else:
+            if statistic not in self.implemented_stats:
+                valid_options = ", ".join(self.implemented_stats.keys())
+                raise ValueError(
+                    f"Invalid {name} '{statistic}'. Valid options are: {valid_options}"
+                )
+
+        log.info(f"Using '{statistic}' as the {name} for hyperoptimization")
+
+        selected_statistic = self.implemented_stats[statistic]
+
+        if self.loss_type == "chi2":
+            return selected_statistic
+
+        elif self.loss_type == "phi2":
+            # In case of phi2, calculate the inverse of the applied statistics
+            # This is only used when calculating statistics over folds
+            return lambda x: np.reciprocal(selected_statistic(x))
+
+    @staticmethod
+    def _average(fold_losses: np.ndarray, axis: int = 0) -> np.ndarray:
+        """
+        Compute the average of the input array along the specified axis.
+
+        Parameters
+        ----------
+            fold_losses: np.ndarray
+                Input array.
+            axis: int, optional
+                Axis along which the mean is computed. Default is 0.
+
+        Returns
+        -------
+            np.ndarray: The average along the specified axis.
+        """
+        return np.average(fold_losses, axis=axis).item()
+
+    @staticmethod
+    def _best_worst(fold_losses: np.ndarray, axis: int = 0) -> np.ndarray:
+        """
+        Compute the maximum value of the input array along the specified axis.
+
+        Parameters
+        ----------
+            fold_losses: np.ndarray
+                Input array.
+            axis: int, optional
+                Axis along which the maximum is computed. Default is 0.
+
+        Returns
+        -------
+            np.ndarray: The maximum value along the specified axis.
+        """
+        return np.max(fold_losses, axis=axis).item()
+
+    @staticmethod
+    def _std(fold_losses: np.ndarray, axis: int = 0) -> np.ndarray:
+        """
+        Compute the standard deviation of the input array along the specified axis.
+
+        Parameters
+        ----------
+            fold_losses: np.ndarray
+                Input array.
+            axis: int, optional
+                Axis along which the standard deviation is computed. Default is 0.
+
+        Returns
+        -------
+            np.ndarray: The standard deviation along the specified axis.
+        """
+        return np.std(fold_losses, axis=axis).item()
 
 
 def fit_distance(pdfs_per_fold=None, **_kwargs):
diff --git a/n3fit/src/n3fit/model_gen.py b/n3fit/src/n3fit/model_gen.py
index 219bbdfc11..2f69fbb275 100644
--- a/n3fit/src/n3fit/model_gen.py
+++ b/n3fit/src/n3fit/model_gen.py
@@ -73,7 +73,9 @@ def _generate_loss(self, mask=None):
         if self.invcovmat is not None:
             if self.rotation:
                 # If we have a matrix diagonal only, padd with 0s and hope it's not too heavy on memory
-                invcovmat_matrix = np.eye(self.invcovmat.shape[-1]) * self.invcovmat[..., np.newaxis]
+                invcovmat_matrix = (
+                    np.eye(self.invcovmat.shape[-1]) * self.invcovmat[..., np.newaxis]
+                )
                 if self.covmat is not None:
                     covmat_matrix = np.eye(self.covmat.shape[-1]) * self.covmat[..., np.newaxis]
                 else:
@@ -82,11 +84,7 @@ def _generate_loss(self, mask=None):
                 covmat_matrix = self.covmat
                 invcovmat_matrix = self.invcovmat
             loss = losses.LossInvcovmat(
-                invcovmat_matrix,
-                self.data,
-                mask,
-                covmat=covmat_matrix,
-                name=self.name
+                invcovmat_matrix, self.data, mask, covmat=covmat_matrix, name=self.name
             )
         elif self.positivity:
             loss = losses.LossPositivity(name=self.name, c=self.multiplier)
diff --git a/n3fit/src/n3fit/model_trainer.py b/n3fit/src/n3fit/model_trainer.py
index 59acff377a..8db9acfaec 100644
--- a/n3fit/src/n3fit/model_trainer.py
+++ b/n3fit/src/n3fit/model_trainer.py
@@ -19,8 +19,11 @@
 from n3fit.backends import operations as op
 import n3fit.hyper_optimization.penalties
 import n3fit.hyper_optimization.rewards
+from n3fit.hyper_optimization.rewards import HyperLoss
 from n3fit.scaler import generate_scaler
 from n3fit.stopping import Stopping
+from n3fit.vpinterface import N3PDF, compute_phi2
+from validphys.core import DataGroupSpec
 from validphys.photon.compute import Photon
 
 log = logging.getLogger(__name__)
@@ -88,14 +91,13 @@ class ModelTrainer:
 
     def __init__(
         self,
+        experiments_data,
         exp_info,
         pos_info,
         integ_info,
         flavinfo,
         fitbasis,
         nnseeds,
-        pass_status="ok",
-        failed_status="fail",
         debug=False,
         kfold_parameters=None,
         max_cores=None,
@@ -103,11 +105,13 @@ def __init__(
         sum_rules=None,
         theoryid=None,
         lux_params=None,
-        replica_idxs=None,
+        replicas=None,
     ):
         """
         Parameters
         ----------
+            experiments_data: list
+                list of `validphys.core.DataGroupSpec` containing experiments
             exp_info: list
                 list of dictionaries containing experiments
             pos_info: list
@@ -120,10 +124,6 @@ def __init__(
                 the name of the basis being fitted
             nnseeds: list(int)
                 the seed used to initialise the NN for each model to be passed to model_gen
-            pass_status: str
-                flag to signal a good run
-            failed_status: str
-                flag to signal a bad run
             debug: bool
                 flag to activate some debug options
             kfold_parameters: dict
@@ -149,14 +149,13 @@ def __init__(
         self.flavinfo = flavinfo
         self.fitbasis = fitbasis
         self._nn_seeds = nnseeds
-        self.pass_status = pass_status
-        self.failed_status = failed_status
         self.debug = debug
         self.all_datasets = []
         self._scaler = None
         self.theoryid = theoryid
         self.lux_params = lux_params
-        self.replica_idxs = replica_idxs
+        self.replicas = replicas
+        self.experiments_data = experiments_data
 
         # Initialise internal variables which define behaviour
         if debug:
@@ -182,12 +181,14 @@ def __init__(
                 self.hyper_penalties.append(pen_fun)
                 log.info("Adding penalty: %s", penalty)
             # Check what is the hyperoptimization target function
-            hyper_loss = kfold_parameters.get("target", None)
-            if hyper_loss is None:
-                hyper_loss = "average"
-                log.warning("No minimization target selected, defaulting to '%s'", hyper_loss)
-            log.info("Using '%s' as the target for hyperoptimization", hyper_loss)
-            self._hyper_loss = getattr(n3fit.hyper_optimization.rewards, hyper_loss)
+            replica_statistic = kfold_parameters.get("replica_statistic", None)
+            fold_statistic = kfold_parameters.get("fold_statistic", None)
+            loss_type = kfold_parameters.get("loss_type", None)
+            self._hyper_loss = HyperLoss(
+                loss_type=loss_type,
+                replica_statistic=replica_statistic,
+                fold_statistic=fold_statistic,
+            )
 
         # Initialize the dictionaries which contain all fitting information
         self.input_list = []
@@ -229,9 +230,8 @@ def __init__(
         if debug:
             self.callbacks.append(callbacks.TimerCallback())
 
-    def set_hyperopt(self, hyperopt_on, keys=None, status_ok="ok"):
+    def set_hyperopt(self, hyperopt_on, keys=None):
         """Set hyperopt options on and off (mostly suppresses some printing)"""
-        self.pass_status = status_ok
         if keys is None:
             keys = []
         self._hyperkeys = keys
@@ -684,7 +684,7 @@ def _generate_pdf(
             regularizer_args=regularizer_args,
             impose_sumrule=self.impose_sumrule,
             scaler=self._scaler,
-            num_replicas=len(self.replica_idxs),
+            num_replicas=len(self.replicas),
             photons=photons,
         )
         return pdf_model
@@ -710,7 +710,7 @@ def _prepare_reporting(self, partition):
             reporting_list.append(reporting_dict)
         return reporting_list
 
-    def _train_and_fit(self, training_model, stopping_object, epochs=100):
+    def _train_and_fit(self, training_model, stopping_object, epochs=100) -> bool:
         """
         Trains the NN for the number of epochs given using
         stopping_object as the stopping criteria
@@ -740,9 +740,8 @@ def _train_and_fit(self, training_model, stopping_object, epochs=100):
 
         # TODO: in order to use multireplica in hyperopt is is necessary to define what "passing" means
         # for now consider the run as good if any replica passed
-        if any(bool(i) for i in stopping_object.e_best_chi2):
-            return self.pass_status
-        return self.failed_status
+        fit_has_passed = any(bool(i) for i in stopping_object.e_best_chi2)
+        return fit_has_passed
 
     def _hyperopt_override(self, params):
         """Unrolls complicated hyperopt structures into very simple dictionaries"""
@@ -798,6 +797,43 @@ def evaluate(self, stopping_object):
         exp_chi2 = self.experimental["model"].compute_losses()["loss"] / self.experimental["ndata"]
         return train_chi2, val_chi2, exp_chi2
 
+    def _filter_datagroupspec(self, datasets_partition):
+        """Takes a list of all input exp datasets as :class:`validphys.core.DataGroupSpec`
+        and select `DataSetSpec`s whose names are in datasets_partition.
+
+        Parameters
+        ----------
+            datasets_partition: List[str]
+                List with names of the datasets you want to select.
+
+        Returns
+        -------
+            filtered_datagroupspec: List[validphys.core.DataGroupSpec]
+                List of filtered exp datasets whose names are in datasets_partition.
+        """
+        filtered_datagroupspec = []
+
+        # self.experiments_data is composed of a list of `DataGroupSpec` objects
+        # These represent a group of related exp data sets
+        # Loop over this list
+        for datagroup in self.experiments_data:
+            filtered_datasetspec = []
+
+            # Each `DataGroupSpec` is composed by several `DataSetSpec` objects
+            # `DataSetSpec` represents each exp dataset
+            # Now, loop over them
+            for dataset in datagroup.datasets:
+                # Include `DataSetSpec`s whose names are in datasets_partition
+                if dataset.name in datasets_partition:
+                    filtered_datasetspec.append(dataset)
+
+            # List of filtered experiments as `DataGroupSpec`
+            filtered_datagroupspec.append(
+                DataGroupSpec(name=f"{datagroup.name}_exp", datasets=filtered_datasetspec)
+            )
+
+        return filtered_datagroupspec
+
     def hyperparametrizable(self, params):
         """
         Wrapper around all the functions defining the fit.
@@ -853,6 +889,8 @@ def hyperparametrizable(self, params):
         # And lists to save hyperopt utilities
         pdfs_per_fold = []
         exp_models = []
+        # phi2 evaluated over training/validation exp data
+        trvl_phi2_per_fold = []
 
         # Generate the grid in x, note this is the same for all partitions
         xinput = self._xgrid_generation()
@@ -860,7 +898,7 @@ def hyperparametrizable(self, params):
         # Initialize all photon classes for the different replicas:
         if self.lux_params:
             photons = Photon(
-                theoryid=self.theoryid, lux_params=self.lux_params, replicas=self.replica_idxs
+                theoryid=self.theoryid, lux_params=self.lux_params, replicas=self.replicas
             )
         else:
             photons = None
@@ -937,33 +975,61 @@ def hyperparametrizable(self, params):
             passed = self._train_and_fit(models["training"], stopping_object, epochs=epochs)
 
             if self.mode_hyperopt:
-                # If doing a hyperparameter scan we need to keep track of the loss function
-                # Since hyperopt needs _one_ number take the average in case of many replicas
-                validation_loss = np.mean(stopping_object.vl_chi2)
+                if not passed:
+                    log.info("Hyperparameter combination fail to find a good fit, breaking")
+                    break
+
+                validation_loss = stopping_object.vl_chi2
 
-                # Compute experimental loss
-                exp_loss_raw = np.average(models["experimental"].compute_losses()["loss"])
-                # And divide by the number of active points in this fold
+                # number of active points in this fold
                 # it would be nice to have a ndata_per_fold variable coming in the vp object...
                 ndata = np.sum([np.count_nonzero(i[k]) for i in self.experimental["folds"]])
                 # If ndata == 0 then it's the opposite, all data is in!
                 if ndata == 0:
                     ndata = self.experimental["ndata"]
+
+                # Compute experimental loss, over excluded datasets
+                exp_loss_raw = models["experimental"].compute_losses()["loss"]
                 experimental_loss = exp_loss_raw / ndata
 
-                hyper_loss = experimental_loss
-                if passed != self.pass_status:
-                    log.info("Hyperparameter combination fail to find a good fit, breaking")
-                    # If the fit failed to fit, no need to add a penalty to the loss
-                    break
-                for penalty in self.hyper_penalties:
-                    hyper_loss += penalty(pdf_model=pdf_model, stopping_object=stopping_object)
+                # Compute penalties per replica
+                penalties = {
+                    penalty.__name__: penalty(pdf_model=pdf_model, stopping_object=stopping_object)
+                    for penalty in self.hyper_penalties
+                }
+
+                # Extracting the necessary data to compute phi2
+                # First, create a list of `validphys.core.DataGroupSpec`
+                # containing only exp datasets within the held out fold
+                experimental_data = self._filter_datagroupspec(partition["datasets"])
+
+                # Compute per replica hyper losses
+                hyper_loss = self._hyper_loss.compute_loss(
+                    penalties=penalties,
+                    experimental_loss=experimental_loss,
+                    pdf_model=pdf_model,
+                    experimental_data=experimental_data,
+                    fold_idx=k,
+                )
+
                 log.info("Fold %d finished, loss=%.1f, pass=%s", k + 1, hyper_loss, passed)
 
+                # Create another list of `validphys.core.DataGroupSpec`
+                # containing now exp datasets that are included in the training/validation dataset
+                trvl_partitions = list(self.kpartitions)
+                trvl_partitions.pop(k)
+                trvl_exp_names = [
+                    exp_name for item in trvl_partitions for exp_name in item['datasets']
+                ]
+                trvl_data = self._filter_datagroupspec(trvl_exp_names)
+                # evaluate phi2 on training/validation exp set
+                trvl_phi2 = compute_phi2(N3PDF(pdf_model.split_replicas()), trvl_data)
+
                 # Now save all information from this fold
                 l_hyper.append(hyper_loss)
                 l_valid.append(validation_loss)
                 l_exper.append(experimental_loss)
+                trvl_phi2_per_fold.append(trvl_phi2)
                 pdfs_per_fold.append(pdf_model)
                 exp_models.append(models["experimental"])
 
@@ -981,20 +1047,32 @@ def hyperparametrizable(self, params):
             # endfor
 
         if self.mode_hyperopt:
+            # turn losses into arrays
+            l_hyper = np.array(l_hyper)
+            l_valid = np.array(l_valid)
+            l_exper = np.array(l_exper)
+
+            # Compute the loss over all folds for hyperopt
+            final_hyper_loss = self._hyper_loss.reduce_over_folds(l_hyper)
+
             # Hyperopt needs a dictionary with information about the losses
             # it is possible to store arbitrary information in the trial file
             # by adding it to this dictionary
             dict_out = {
                 "status": passed,
-                "loss": self._hyper_loss(
-                    fold_losses=l_hyper, pdfs_per_fold=pdfs_per_fold, experimental_models=exp_models
-                ),
+                "loss": final_hyper_loss,
                 "validation_loss": np.average(l_valid),
                 "experimental_loss": np.average(l_exper),
                 "kfold_meta": {
                     "validation_losses": l_valid,
+                    "validation_losses_phi2": np.array(trvl_phi2_per_fold),
                     "experimental_losses": l_exper,
-                    "hyper_losses": l_hyper,
+                    "hyper_losses": np.array(self._hyper_loss.chi2_matrix),
+                    "hyper_losses_phi2": np.array(self._hyper_loss.phi2_vector),
+                    "penalties": {
+                        name: np.array(values)
+                        for name, values in self._hyper_loss.penalties.items()
+                    },
                 },
             }
             return dict_out
diff --git a/n3fit/src/n3fit/performfit.py b/n3fit/src/n3fit/performfit.py
index 6514029ca2..9447ad9279 100644
--- a/n3fit/src/n3fit/performfit.py
+++ b/n3fit/src/n3fit/performfit.py
@@ -21,6 +21,7 @@
 @n3fit.checks.check_fiatlux_pdfs_id
 def performfit(
     *,
+    experiments_data,
     n3fit_checks_action,  # wrapper for all checks
     replicas,  # checks specific to performfit
     replicas_nnseed_fitting_data_dict,
@@ -74,6 +75,8 @@ def performfit(
         data: validphys.core.DataGroupSpec
             containing the datasets to be included in the fit. (Only used
             for checks)
+        experiments_data: list[validphys.core.DataGroupSpec]
+            similar to `data` but now passed as argument to `ModelTrainer`
         replicas_nnseed_fitting_data_dict: list[tuple]
             list with element for each replica (typically just one) to be
             fitted. Each element
@@ -181,6 +184,7 @@ def performfit(
         # Generate a ModelTrainer object
         # this object holds all necessary information to train a PDF (up to the NN definition)
         the_model_trainer = ModelTrainer(
+            experiments_data,
             exp_info,
             posdatasets_fitting_pos_dict,
             integdatasets_fitting_integ_dict,
@@ -194,7 +198,7 @@ def performfit(
             sum_rules=sum_rules,
             theoryid=theoryid,
             lux_params=fiatlux,
-            replica_idxs=replica_idxs,
+            replicas=replica_idxs,
         )
 
         # This is just to give a descriptive name to the fit function
diff --git a/n3fit/src/n3fit/tests/test_hyperopt.py b/n3fit/src/n3fit/tests/test_hyperopt.py
index cecc747452..965c9773b7 100644
--- a/n3fit/src/n3fit/tests/test_hyperopt.py
+++ b/n3fit/src/n3fit/tests/test_hyperopt.py
@@ -6,17 +6,99 @@
 import shutil
 import subprocess as sp
 
+import numpy as np
 from numpy.testing import assert_approx_equal
+import pytest
 
-from n3fit.hyper_optimization import rewards
+from n3fit.hyper_optimization.rewards import HyperLoss
+from n3fit.model_gen import generate_pdf_model
+from validphys.loader import Loader
 
 
-def test_rewards():
-    """Ensure that rewards continue doing what they are supposed to do"""
-    losses = [0.0, 1.0, 2.0]
-    assert_approx_equal(rewards.average(losses), 1.0)
-    assert_approx_equal(rewards.best_worst(losses), 2.0)
-    assert_approx_equal(rewards.std(losses), 0.816496580927726)
+def generate_pdf(seed, num_replicas):
+    """Generate generic pdf model."""
+    fake_fl = [
+        {"fl": i, "largex": [0, 1], "smallx": [1, 2]}
+        for i in ["u", "ubar", "d", "dbar", "c", "g", "s", "sbar"]
+    ]
+    pdf_model = generate_pdf_model(
+        nodes=[8],
+        activations=["linear"],
+        seed=seed,
+        num_replicas=num_replicas,
+        flav_info=fake_fl,
+        fitbasis="FLAVOUR",
+    )
+    return pdf_model
+
+
+def get_experimental_data(dataset_name="NMC", theoryid=399):
+    """Get experimental data set using validphys.
+
+    Returns a list defined by the data set as
+    `validphys.core.DataGroupSpec`.
+    """
+    loader = Loader()
+    ds = loader.check_dataset(dataset_name, theoryid=theoryid, cuts="internal")
+    return loader.check_experiment("My DataGroupSpec", [ds])
+
+
+@pytest.mark.parametrize(
+    "loss_type, replica_statistic, expected_per_fold_loss",
+    [
+        ("chi2", "average", 0.15),
+        ("chi2", "best_worst", 0.2),
+        ("chi2", "std", 0.05),
+        ("phi2", None, None),
+    ],
+)
+def test_compute_per_fold_loss(loss_type, replica_statistic, expected_per_fold_loss):
+    """Check that the losses per fold are calculated correctly.
+
+    This example assumes a 2 replica calculation with 3 added penalties.
+    """
+    # generate 2 replica pdf model
+    pdf_model = generate_pdf(seed=0, num_replicas=2)
+    # add 3 penalties for a 2 replica model
+    penalties = {
+        'saturation': np.array([0.0, 0.0]),
+        'patience': np.array([0.0, 0.0]),
+        'integrability': np.array([0.0, 0.0]),
+    }
+    # experimental losses for each replica
+    experimental_loss = np.array([0.1, 0.2])
+    # get experimental data to compare with
+    experimental_data = [get_experimental_data()]
+
+    loss = HyperLoss(loss_type=loss_type, replica_statistic=replica_statistic)
+
+    # calculate statistic loss for one specific fold
+    predicted_per_fold_loss = loss.compute_loss(
+        penalties, experimental_loss, pdf_model, experimental_data
+    )
+
+    # Assert
+    if expected_per_fold_loss is not None:
+        assert_approx_equal(predicted_per_fold_loss, expected_per_fold_loss)
+    else:
+        assert predicted_per_fold_loss > 0  # Test for non-negativity
+        assert predicted_per_fold_loss.dtype == np.float64  # Test its type
+        # Add more property-based tests specific to "phi2" if possible
+
+
+def test_loss_reduce_over_folds():
+    """Ensure that the hyper loss statistics over all folds are calculated correctly."""
+    # define losses for 3 folds
+    losses = np.array([1.0, 2.0, 3.0])
+
+    loss_average = HyperLoss(fold_statistic="average")
+    assert_approx_equal(loss_average.reduce_over_folds(losses), 2.0)
+
+    loss_best_worst_best_worst = HyperLoss(fold_statistic="best_worst")
+    assert_approx_equal(loss_best_worst_best_worst.reduce_over_folds(losses), 3.0)
+
+    loss_std = HyperLoss(fold_statistic="std")
+    assert_approx_equal(loss_std.reduce_over_folds(losses), 0.816496580927726)
 
 
 REGRESSION_FOLDER = pathlib.Path(__file__).with_name("regressions")
@@ -70,7 +152,7 @@ def test_restart_from_pickle(tmp_path):
     direct_json_path = f"{output_direct}/nnfit/replica_{REPLICA}/tries.json"
     direct_json = load_data(direct_json_path)
 
-    # minimum check: the generated list of nested dictionaries have same lenght
+    # minimum check: the generated list of nested dictionaries have same length
     assert len(restart_json) == len(direct_json)
 
     for i in range(n_trials_total):
diff --git a/n3fit/src/n3fit/vpinterface.py b/n3fit/src/n3fit/vpinterface.py
index 12e5aca374..453ef5738c 100644
--- a/n3fit/src/n3fit/vpinterface.py
+++ b/n3fit/src/n3fit/vpinterface.py
@@ -27,8 +27,10 @@
 from n3fit.backends import PREPROCESSING_LAYER_ALL_REPLICAS
 from validphys.arclength import arc_lengths, integrability_number
 from validphys.core import PDF, MCStats
+from validphys.covmats import covmat_from_systematics, sqrt_covmat
 from validphys.lhapdfset import LHAPDFSet
 from validphys.pdfbases import ALL_FLAVOURS, check_basis
+from validphys.results import abs_chi2_data, phi_data, results
 
 log = logging.getLogger(__name__)
 # Order of the evolution basis output from n3fit
@@ -331,3 +333,57 @@ def compute_arclength(self, q0=1.65, basis="evolution", flavours=None):
         flavours = ["sigma", "gluon", "V", "V3", "V8"]
     ret = arc_lengths(self, [q0], basis, flavours)
     return ret.stats.central_value()
+
+
+def compute_phi2(n3pdf, experimental_data):
+    """Compute phi2 using validphys functions.
+
+    For more info on how phi is calculated; see Eq.(4.6) of 10.1007/JHEP04(2015)040
+
+    Parameters
+    ----------
+        n3pdfs: :class:`n3fit.vpinterface.N3PDF`
+            `N3PDF` instance defining the n3fitted multi-replica PDF
+        experimental_data: List[validphys.core.DataGroupSpec]
+            List of experiment group datasets as `DataGroupSpec` instances
+
+    Returns
+    -------
+        sum_phi2: float
+            Sum of phi2 over all experimental group datasets
+
+    Example
+    -------
+    >>> from n3fit.vpinterface import N3PDF, compute_phi2
+    >>> from n3fit.model_gen import generate_pdf_model
+    >>> from validphys.loader import Loader
+    >>> fake_fl = [{'fl' : i, 'largex' : [0,1], 'smallx': [1,2]} for i in ['u', 'ubar', 'd', 'dbar', 'c', 'g', 's', 'sbar']]
+    >>> pdf_model = generate_pdf_model(nodes=[8], activations=['linear'], seed=0, num_replicas=2, flav_info=fake_fl, fitbasis="FLAVOUR")
+    >>> n3pdf = N3PDF(pdf_model.split_replicas())
+    >>> ds = Loader().check_dataset("NMC", theoryid=399, cuts="internal")
+    >>> data_group_spec = Loader().check_experiment("My DataGroupSpec", [ds])
+    >>> phi2 = compute_phi2(n3pdf, [data_group_spec])
+    """
+    sum_phi2 = 0.0
+    # Loop over the list of `DataGroupSpec` objects
+    for datagroupspec in experimental_data:
+        # datagroupspec is an instance of `DataGroupSpec`
+
+        # Loop over `DataGroupSpec` datasets
+        for datasetspec in datagroupspec.datasets:
+            # datasetspec is an instance of `DataSetSpec`
+
+            # get covariant matrix for each `DataSetSpec`
+            covmat = covmat_from_systematics(datasetspec.load_commondata(), datasetspec)
+
+            # get experiment (`DataResult`) and theory (`ThPredictionsResult`) predictions
+            res = results(datasetspec, n3pdf, covmat, sqrt_covmat(covmat))
+
+            # calculate standard chi2 (all_chi2) and chi2 using PDF central values (central_chi2)
+            chi2 = abs_chi2_data(res)
+
+            # calculate phi and store phi**2
+            phi, _ = phi_data(chi2)
+            sum_phi2 += phi**2
+
+    return sum_phi2
diff --git a/validphys2/src/validphys/commondata.py b/validphys2/src/validphys/commondata.py
index 1bbd7c0c2f..5c0b534d75 100644
--- a/validphys2/src/validphys/commondata.py
+++ b/validphys2/src/validphys/commondata.py
@@ -6,9 +6,11 @@
 :py:mod:`validphys.coredata`
 
 """
+import functools
+
 from reportengine import collect
 from validphys.commondataparser import load_commondata
-import functools
+
 
 @functools.lru_cache
 def loaded_commondata_with_cuts(commondata, cuts):
diff --git a/validphys2/src/validphys/covmats.py b/validphys2/src/validphys/covmats.py
index 1a4f050eca..102745920a 100644
--- a/validphys2/src/validphys/covmats.py
+++ b/validphys2/src/validphys/covmats.py
@@ -1,8 +1,8 @@
 """Module for handling logic and manipulation of covariance and correlation
 matrices on different levels of abstraction
 """
-import logging
 import functools
+import logging
 
 import numpy as np
 import pandas as pd
diff --git a/validphys2/src/validphys/filters.py b/validphys2/src/validphys/filters.py
index 566b372c6d..d94a9e4372 100644
--- a/validphys2/src/validphys/filters.py
+++ b/validphys2/src/validphys/filters.py
@@ -3,10 +3,10 @@
 """
 
 from collections.abc import Mapping
+import functools
 from importlib.resources import read_text
 import logging
 import re
-import functools
 
 import numpy as np
 
diff --git a/validphys2/src/validphys/n3fit_data_utils.py b/validphys2/src/validphys/n3fit_data_utils.py
index fe908d73cb..5575ebfcaf 100644
--- a/validphys2/src/validphys/n3fit_data_utils.py
+++ b/validphys2/src/validphys/n3fit_data_utils.py
@@ -8,6 +8,7 @@
 loading their fktables (and applying any necessary cuts).
 """
 import dataclasses
+import functools
 from itertools import zip_longest
 
 import numpy as np
diff --git a/validphys2/src/validphys/utils.py b/validphys2/src/validphys/utils.py
index 3471cfb954..477d302212 100644
--- a/validphys2/src/validphys/utils.py
+++ b/validphys2/src/validphys/utils.py
@@ -9,11 +9,11 @@
 import pathlib
 import shutil
 import tempfile
-from typing import Any, Sequence, Mapping, Hashable
+from typing import Any, Hashable, Mapping, Sequence
 
+from frozendict import frozendict
 import numpy as np
 from validobj import ValidationError, parse_input
-from frozendict import frozendict
 
 
 def make_hashable(obj: Any):
@@ -34,11 +34,13 @@ def freeze_args(func):
     Into immutable
     Useful to be compatible with cache
     """
+
     @functools.wraps(func)
     def wrapped(*args, **kwargs):
         args = tuple([make_hashable(arg) for arg in args])
         kwargs = {k: make_hashable(v) for k, v in kwargs.items()}
         return func(*args, **kwargs)
+
     return wrapped