Join models into one observables model and 3 losses
APJansen committed Feb 22, 2024
1 parent faadba0 commit f696bfd
Showing 9 changed files with 441 additions and 410 deletions.
1 change: 1 addition & 0 deletions n3fit/src/n3fit/backends/__init__.py
@@ -18,5 +18,6 @@
     set_eager,
     set_initial_state,
 )
+from n3fit.backends.keras_backend.metrics import LossMetric

 print("Using Keras backend")
55 changes: 14 additions & 41 deletions n3fit/src/n3fit/backends/keras_backend/MetaModel.py
@@ -120,7 +120,6 @@ def __init__(self, input_tensors, output_tensors, scaler=None, input_values=None
         self.single_replica_generator = None

         self.target_tensors = None
-        self.compute_losses_function = None
         self._scaler = scaler

     @tf.autograph.experimental.do_not_convert
@@ -170,6 +169,7 @@ def perform_fit(self, x=None, y=None, epochs=1, **kwargs):
         x_params = self._parse_input(x)
         if y is None:
             y = self.target_tensors
+        y = {name: np.zeros((1, 1)) for name in self.loss.keys()}
         history = super().fit(x=x_params, y=y, epochs=epochs, **kwargs)
         loss_dict = history.history
         return loss_dict
@@ -180,44 +180,6 @@ def predict(self, x=None, **kwargs):
         result = super().predict(x=x, **kwargs)
         return result

-    def compute_losses(self):
-        """
-        This function is equivalent to the model ``evaluate(x,y)`` method of most TensorFlow models
-        which return a dictionary of losses per output layer.
-        The losses reported in the ``evaluate`` method for n3fit are, however, summed over replicas.
-        Instead the loss we are interested in is usually the output of the model (i.e., predict)
-        This function then generates a dict of partial losses of the model separated per replica.
-        i.e., the output for experiment {'LHC_exp'} will be an array of Nrep elements.
-
-        Returns
-        -------
-            dict
-                a dictionary with all partial losses of the model
-        """
-        if self.compute_losses_function is None:
-            # If it is the first time we are passing through, compile the function and save it
-            out_names = [f"{i}_loss" for i in self.output_names]
-            out_names.insert(0, "loss")
-
-            # Compile a evaluation function
-            @tf.function
-            def losses_fun():
-                predictions = self(self._parse_input(None))
-                # If we only have one dataset the output changes
-                if len(out_names) == 2:
-                    predictions = [predictions]
-                total_loss = tf.reduce_sum(predictions, axis=0)
-                ret = [total_loss] + predictions
-                return dict(zip(out_names, ret))
-
-            self.compute_losses_function = losses_fun
-
-        ret = self.compute_losses_function()
-
-        # The output of this function is to be used by python (and numpy)
-        # so we need to convert the tensors
-        return _to_numpy_or_python_type(ret)

     def compile(
         self,
         optimizer_name="RMSprop",
@@ -237,7 +199,7 @@ def compile(
         - A ``target_output`` can be defined. If done in this way
           (for instance because we know the target data will be the same for the whole fit)
           the data will be compiled together with the model and won't be necessary to
-          input it again when calling the ``perform_fit`` or ``compute_losses`` methods.
+          input it again when calling the ``perform_fit`` method.

         Parameters
         ----------
@@ -283,7 +245,7 @@
             target_output = [target_output]
         self.target_tensors = target_output

-        super().compile(optimizer=opt, loss=loss)
+        super().compile(optimizer=opt, loss=loss, **kwargs)

     def set_masks_to(self, names, val=0.0):
         """Set all mask value to the selected value
@@ -414,6 +376,17 @@ def load_identical_replicas(self, model_file):
         for i_replica in range(self.num_replicas):
             self.set_replica_weights(weights, i_replica)

+    def get_all_replica_weights(self):
+        """
+        Get the weights of all the replicas.
+
+        Returns
+        -------
+            list
+                list of dictionaries with the weights of each replica
+        """
+        return [self.get_replica_weights(i) for i in range(self.num_replicas)]


 def is_stacked_single_replicas(layer):
     """
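With ``compute_losses`` gone, the losses live inside the model itself, which is why ``perform_fit`` above now feeds zero-valued placeholders as targets. A toy reproduction of that pattern (the model and names below are illustrative assumptions, not n3fit code):

```python
import numpy as np
import tensorflow as tf

# Sketch: the model's output *is* already the loss, so compile() receives a
# loss function that ignores the zero-valued dummy target and simply returns
# the prediction. The "exp_loss" output name is hypothetical.
inp = tf.keras.Input(shape=(3,))
out = tf.keras.layers.Dense(1)(inp)
model = tf.keras.Model(inp, {"exp_loss": out})

model.compile(
    optimizer="rmsprop",
    loss={"exp_loss": lambda y_true, y_pred: y_pred},  # target is ignored
)

# One dummy target per compiled loss, mirroring the new perform_fit line:
y = {name: np.zeros((1, 1)) for name in model.loss.keys()}
model.fit(np.random.rand(1, 3), y, epochs=1, verbose=0)
```

Minimizing a loss that just returns the prediction drives the gradient through the internal loss computation, so the dummy targets never influence the fit.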
24 changes: 14 additions & 10 deletions n3fit/src/n3fit/backends/keras_backend/callbacks.py
@@ -10,9 +10,10 @@

 import logging
 from time import time
+
 import numpy as np
 import tensorflow as tf
-from tensorflow.keras.callbacks import TensorBoard, Callback
+from tensorflow.keras.callbacks import Callback, TensorBoard

 log = logging.getLogger(__name__)

@@ -30,7 +31,7 @@ def __init__(self, count_range=100):
         self.last_time = 0

     def on_epoch_end(self, epoch, logs=None):
-        """ At the end of every epoch it checks the time """
+        """At the end of every epoch it checks the time"""
         new_time = time()
         if epoch == 0:
             # The first epoch is only useful for starting
@@ -45,13 +46,13 @@ def on_epoch_end(self, epoch, logs=None):
         self.last_time = new_time

     def on_train_end(self, logs=None):
-        """ Print the results """
+        """Print the results"""
         total_time = time() - self.starting_time
         n_times = len(self.all_times)
         # Skip the first 100 epochs to avoid fluctuations due to compilations of part of the code
         # by epoch 100 all parts of the code have usually been called so it's a good compromise
-        mean = np.mean(self.all_times[min(110, n_times-1):])
-        std = np.std(self.all_times[min(110, n_times-1):])
+        mean = np.mean(self.all_times[min(110, n_times - 1) :])
+        std = np.std(self.all_times[min(110, n_times - 1) :])
         log.info(f"> > Average time per epoch: {mean:.5} +- {std:.5} s")
         log.info(f"> > > Total time: {total_time/60:.5} min")

@@ -77,7 +78,7 @@ def __init__(self, stopping_object, log_freq=100):
         self.stopping_object = stopping_object

     def on_epoch_end(self, epoch, logs=None):
-        """ Function to be called at the end of every epoch """
+        """Function to be called at the end of every epoch"""
         print_stats = ((epoch + 1) % self.log_freq) == 0
         # Note that the input logs correspond to the fit before the weights are updated
         self.stopping_object.monitor_chi2(logs, epoch, print_stats=print_stats)
@@ -103,23 +104,26 @@ class LagrangeCallback(Callback):
         List of the names of the datasets to be trained
     multipliers: list(float)
         List of multipliers to be applied
+    losses: dict
+        Dictionary of losses
     update_freq: int
         each how many epochs the positivity lambda is updated
     """

-    def __init__(self, datasets, multipliers, update_freq=100):
+    def __init__(self, datasets, multipliers, losses, update_freq=100):
         super().__init__()
         if len(multipliers) != len(datasets):
             raise ValueError("The number of datasets and multipliers do not match")
         self.update_freq = update_freq
         self.datasets = datasets
         self.multipliers = multipliers
         self.updateable_weights = []
+        self.losses = losses

     def on_train_begin(self, logs=None):
-        """ Save an instance of all relevant layers """
+        """Save an instance of all relevant layers"""
         for layer_name in self.datasets:
-            layer = self.model.get_layer(layer_name)
+            layer = self.losses[layer_name]
             self.updateable_weights.append(layer.weights)

     @tf.function
@@ -133,7 +137,7 @@ def _update_weights(self):
                 w.assign(w * multiplier)

     def on_epoch_end(self, epoch, logs=None):
-        """ Function to be called at the end of every epoch """
+        """Function to be called at the end of every epoch"""
         if (epoch + 1) % self.update_freq == 0:
             self._update_weights()

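Since the per-dataset losses are no longer layers inside the model, `LagrangeCallback` cannot look them up with `model.get_layer` and instead receives them up front. A wiring sketch (the `loss_layers` mapping, dataset names, and `model` are assumptions for illustration, not part of this diff):

```python
from n3fit.backends.keras_backend.callbacks import LagrangeCallback

# `loss_layers` is assumed to map positivity dataset names to the loss
# objects built alongside the observables model; it is not defined here.
pos_datasets = ["POS_F2U", "POS_DYS"]  # hypothetical dataset names

callback = LagrangeCallback(
    datasets=pos_datasets,
    multipliers=[1.05, 1.10],  # one multiplier per dataset
    losses={name: loss_layers[name] for name in pos_datasets},
    update_freq=100,  # rescale the Lagrange multipliers every 100 epochs
)

model.perform_fit(epochs=1000, callbacks=[callback])
```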
41 changes: 41 additions & 0 deletions n3fit/src/n3fit/backends/keras_backend/metrics.py
@@ -0,0 +1,41 @@
+import tensorflow as tf
+from tensorflow.keras.metrics import Metric
+
+import n3fit.backends.keras_backend.operations as op
+
+
+class LossMetric(Metric):
+    """
+    Implementation of the (validation) loss as a metric.
+    Keeps track of per replica loss internally, aggregates just for logging.
+
+    Parameters
+    ----------
+    loss_layer : tf.keras.layers.Layer
+        The loss layer to use for the metric.
+    agg : str
+        Aggregation method to use for the replicas. Can be 'sum' or 'mean'.
+    """
+
+    def __init__(self, loss_layer, agg='sum', name='val_loss', **kwargs):
+        super().__init__(name=name, **kwargs)
+        self.loss_layer = loss_layer
+        if agg == 'sum':
+            self.agg = op.sum
+        elif agg == 'mean':
+            self.agg = op.mean
+        else:
+            raise ValueError(f'agg must be sum or mean, got {agg}')
+        num_replicas = loss_layer.output.shape[0]
+        self.per_replica_losses = self.add_weight(
+            name="per_replica_losses", shape=(num_replicas,), initializer="zeros"
+        )
+
+    def update_state(self, y_true, y_pred, sample_weight=None):
+        self.per_replica_losses.assign(self.loss_layer(y_pred))
+
+    def result(self):
+        return self.agg(self.per_replica_losses)
+
+    def reset_state(self):
+        self.per_replica_losses.assign(tf.zeros_like(self.per_replica_losses))
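Because `compile` now forwards `**kwargs` to Keras, the metric can be attached at compile time. A hedged sketch of how `LossMetric` might be hooked up (`val_loss_layer`, `training_losses`, and `model` are assumptions; the real wiring happens elsewhere in n3fit):

```python
from n3fit.backends import LossMetric

# `val_loss_layer` stands in for a layer whose output is the vector of
# per-replica validation losses; it is not defined in this commit.
val_metric = LossMetric(val_loss_layer, agg='sum', name='val_loss')

model.compile(
    optimizer_name="RMSprop",
    loss=training_losses,   # dict of per-experiment loss functions
    metrics=[val_metric],   # per-replica values kept internally,
)                           # aggregated only for the epoch logs
```

Keeping the per-replica vector as the metric's state means the aggregation choice (`sum` or `mean`) only affects what is logged, not what is stored.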
19 changes: 10 additions & 9 deletions n3fit/src/n3fit/hyper_optimization/rewards.py
@@ -145,15 +145,16 @@ def fit_future_tests(n3pdfs=None, experimental_models=None, **_kwargs):
             # Update the mask of the last_model so that its synced with this layer
             last_model.get_layer(layer.name).update_mask(layer.mask)

-            # Compute the loss with pdf errors
-            pdf_chi2 = exp_model.compute_losses()["loss"][0]
-
-            # And the loss of the best (most complete) fit
-            best_chi2 = last_model.compute_losses()["loss"][0]
-
-            # Now make this into a measure of the total loss
-            # for instance, any deviation from the "best" value is bad
-            total_loss += np.abs(best_chi2 - pdf_chi2)
+            # TODO Aron: replace compute_losses here, is this even ever called?
+            # # Compute the loss with pdf errors
+            # pdf_chi2 = exp_model.compute_losses()["loss"][0]
+            #
+            # # And the loss of the best (most complete) fit
+            # best_chi2 = last_model.compute_losses()["loss"][0]
+            #
+            # # Now make this into a measure of the total loss
+            # # for instance, any deviation from the "best" value is bad
+            # total_loss += np.abs(best_chi2 - pdf_chi2)

     if compatibility_mode:
         set_eager(False)
4 changes: 2 additions & 2 deletions n3fit/src/n3fit/io/writer.py
@@ -308,11 +308,12 @@ def _write_metadata_json(self, i, out_path):
             json.dump(json_dict, fs, indent=2, cls=SuperEncoder)

         log.info(
-            "Best fit for replica #%d, chi2=%.3f (tr=%.3f, vl=%.3f)",
+            "Best fit for replica #%d, chi2=%.3f (tr=%.3f, vl=%.3f), at epoch %d.",
             self.replica_numbers[i],
             self.true_chi2[i],
             self.tr_chi2[i],
             self.vl_chi2[i],
+            self.stopping_object.e_best_chi2[i],
         )

     def _export_pdf_grid(self, i, out_path):
@@ -514,7 +515,6 @@ def evln2lha(evln, nf=6):
         - 2 * evln[8]
     ) / 120

-
 # if a heavy quark is not active at Q0 (the scale at which the output of the fit is stored),
 # keep the PDF values at 0.0 to prevent small negative values due to numerical instabilities

