From f2a38af2e87ba634e29bf6df664fce0497bd2bb2 Mon Sep 17 00:00:00 2001 From: Aron Date: Mon, 14 Aug 2023 15:44:44 +0200 Subject: [PATCH 01/29] Add best_epochs and positivity_statusses to Stopping (and run black) --- n3fit/src/n3fit/stopping.py | 58 ++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py index d08a0e04ef..2786df4876 100644 --- a/n3fit/src/n3fit/stopping.py +++ b/n3fit/src/n3fit/stopping.py @@ -28,6 +28,7 @@ be used instead. """ import logging + import numpy as np log = logging.getLogger(__name__) @@ -213,27 +214,27 @@ def all_vl_chi2(self): return self._vl_dict def all_tr_chi2_for_replica(self, r): - """" Return the tr chi2 per dataset for a given replica """ + """ " Return the tr chi2 per dataset for a given replica""" return {k: np.take(i, r) for k, i in self.all_tr_chi2.items()} def all_vl_chi2_for_replica(self, r): - """" Return the vl chi2 per dataset for a given replica """ + """ " Return the vl chi2 per dataset for a given replica""" return {k: np.take(i, r) for k, i in self.all_vl_chi2.items()} def total_partial_tr_chi2(self): - """ Return the tr chi2 summed over replicas per experiment""" + """Return the tr chi2 summed over replicas per experiment""" return {k: np.sum(i) for k, i in self.all_tr_chi2.items()} def total_partial_vl_chi2(self): - """ Return the vl chi2 summed over replicas per experiment""" + """Return the vl chi2 summed over replicas per experiment""" return {k: np.sum(i) for k, i in self.all_tr_chi2.items()} def total_tr_chi2(self): - """ Return the total tr chi2 summed over replicas """ + """Return the total tr chi2 summed over replicas""" return np.sum(self.tr_chi2) def total_vl_chi2(self): - """ Return the total vl chi2 summed over replicas """ + """Return the total vl chi2 summed over replicas""" return np.sum(self.vl_chi2) def __str__(self): @@ -252,7 +253,7 @@ def __init__(self, pdf_model): self._best_vl_chi2 = INITIAL_CHI2 def positivity_pass(self): - """ By definition, if we have a ``best_epoch`` then positivity passed """ + """By definition, if we have a ``best_epoch`` then positivity passed""" if self._best_epoch is None: return False else: @@ -280,18 +281,18 @@ def positivity_status(self): return POS_BAD def register_best(self, chi2, epoch): - """ Register a new best state and some metadata about it """ + """Register a new best state and some metadata about it""" self._weights = self._pdf_model.get_weights() self._best_epoch = epoch self._best_vl_chi2 = chi2 def reload(self): - """ Reload the weights of the best state """ + """Reload the weights of the best state""" if self._weights: self._pdf_model.set_weights(self._weights) - def stop_training(self, epoch = None): - """ Stop training this replica if not stopped before """ + def stop_training(self, epoch=None): + """Stop training this replica if not stopped before""" if self._pdf_model.trainable: self._pdf_model.trainable = False self._stop_epoch = epoch @@ -309,8 +310,8 @@ class FitHistory: Parameters ---------- - pdf_models: n3fit.backends.MetaModel - list of PDF models being trained, used to saved the weights + pdf_model: n3fit.backends.MetaModel + PDF model being trained, used to saved the weights """ def __init__(self, pdf_models, tr_ndata, vl_ndata): @@ -337,11 +338,11 @@ def __init__(self, pdf_models, tr_ndata, vl_ndata): @property def best_epoch(self): - """ Return the best epoch per replica """ + """Return the best epoch per replica""" return [i.best_epoch for i in self._replicas] def get_state(self, epoch): - """ Get the FitState of the system for a given epoch """ + """Get the FitState of the system for a given epoch""" try: return self._history[epoch] except IndexError as e: @@ -360,11 +361,11 @@ def save_best_replica(self, i, epoch=None): self._replicas[i].register_best(loss, epoch) def all_positivity_status(self): - """ Returns whether the positivity passed or not per replica """ + """Returns whether the positivity passed or not per replica""" return np.array([i.positivity_status for i in self._replicas]) def all_best_vl_loss(self): - """ Returns the best validation loss for each replica """ + """Returns the best validation loss for each replica""" return np.array([i.best_vl for i in self._replicas]) def register(self, epoch, training_info, validation_info): @@ -385,7 +386,7 @@ def register(self, epoch, training_info, validation_info): return fitstate def stop_training_replica(self, i, e): - """ Stop training replica i in epoch e""" + """Stop training replica i in epoch e""" self._replicas[i].stop_training(e) def reload(self): @@ -415,8 +416,8 @@ class Stopping: all_data_dict: dict list containg all dictionaries containing all information about the experiments/validation/regularizers/etc to be parsed by Stopping - pdf_models: list(n3fit.backends.MetaModel) - list of pdf_models being trained + pdf_model: n3fit.backends.MetaModel + the pdf model being trained threshold_positivity: float maximum value allowed for the sum of all positivity losses total_epochs: int @@ -431,7 +432,7 @@ def __init__( self, validation_model, all_data_dicts, - pdf_models, + pdf_model, threshold_positivity=THRESHOLD_POS, total_epochs=0, stopping_patience=7000, @@ -443,13 +444,13 @@ def __init__( # Create the History object tr_ndata, vl_ndata, pos_sets = parse_ndata(all_data_dicts) - self._history = FitHistory(pdf_models, tr_ndata, vl_ndata) + self._history = FitHistory(pdf_model, tr_ndata, vl_ndata) # And the positivity checker self._positivity = Positivity(threshold_positivity, pos_sets) # Initialize internal variables for the stopping - self.n_replicas = len(pdf_models) + self.n_replicas = pdf_model.output_shape[-1] self.threshold_chi2 = threshold_chi2 self.stopping_degree = np.zeros(self.n_replicas, dtype=int) self.count = np.zeros(self.n_replicas, dtype=int) @@ -459,21 +460,24 @@ def __init__( self.stopping_patience = stopping_patience self.total_epochs = total_epochs + self.best_epochs = np.zeros(self.n_replicas, dtype=int) + self.positivity_statusses = np.repeat(POS_BAD, self.n_replicas) + @property def vl_chi2(self): - """ Current validation chi2 """ + """Current validation chi2""" validation_info = self._validation.compute_losses() fitstate = FitState(None, validation_info) return fitstate.vl_chi2 @property def e_best_chi2(self): - """ Epoch of the best chi2, if there is no best epoch, return last""" + """Epoch of the best chi2, if there is no best epoch, return last""" return self._history.best_epoch @property def stop_epoch(self): - """ Epoch in which the fit is stopped """ + """Epoch in which the fit is stopped""" return self._history.final_epoch + 1 @property @@ -602,7 +606,7 @@ def stop_here(self): return self.stop_now def get_next_replica(self): - """ Return the next ReplicaState object""" + """Return the next ReplicaState object""" return next(self._history) def chi2exps_json(self, replica=0, log_each=100): From a514bf19a8af8ceacc873e1f2700af20fbd30283 Mon Sep 17 00:00:00 2001 From: Aron Date: Mon, 14 Aug 2023 19:24:15 +0200 Subject: [PATCH 02/29] Clean up Writer --- n3fit/src/n3fit/io/writer.py | 323 +++++++++++++++++++++++----------- n3fit/src/n3fit/performfit.py | 53 ++---- 2 files changed, 228 insertions(+), 148 deletions(-) diff --git a/n3fit/src/n3fit/io/writer.py b/n3fit/src/n3fit/io/writer.py index 6b4879f5a8..fc4d24ffdb 100644 --- a/n3fit/src/n3fit/io/writer.py +++ b/n3fit/src/n3fit/io/writer.py @@ -4,13 +4,15 @@ The goal is to generate the same folder/file structure as the old nnfit code so previously active scripts can still work. """ -import os import json +import os + import numpy as np -from reportengine.compat import yaml -import validphys + import n3fit from n3fit import vpinterface +from reportengine.compat import yaml +import validphys XGRID = np.array( [ @@ -213,95 +215,124 @@ ] ) + class WriterWrapper: - def __init__(self, replica_number, pdf_object, stopping_object, q2, timings): + def __init__(self, replica_numbers, pdf_objects, stopping_object, all_chi2s, q2, timings): """ - Initializes the writer for one given replica. This is decoupled from the writing - of the fit in order to fix some of the variables which would be, in principle, - be shared by several different history objects. + Initializes the writer for all replicas. + + This is decoupled from the writing of the fit in order to fix some of the variables + which would be, in principle, be shared by several different history objects. Parameters ---------- - `replica_number` - index of the replica - `pdf_object` + `replica_numbers` + indices of the replicas + `pdf_objects` function to evaluate with a grid in x to generate a pdf `stopping_object` a stopping.Stopping object + `all_chi2s` + list of all the chi2s, in the order: tr_chi2, vl_chi2, true_chi2 `q2` q^2 of the fit `timings` dictionary of the timing of the different events that happened """ - self.replica_number = replica_number - self.pdf_object = pdf_object + self.replica_numbers = replica_numbers + self.pdf_objects = pdf_objects self.stopping_object = stopping_object self.q2 = q2 self.timings = timings + self.tr_chi2, self.vl_chi2, self.true_chi2 = all_chi2s - def write_data(self, replica_path_set, fitname, tr_chi2, vl_chi2, true_chi2): + def write_data(self, save_path, fitname, weights_name): """ - Wrapper around the `storefit` function. + Save all the data of a fit, for all replicas. Parameters ---------- - `replica_path_set` - full path for the replica, ex: `${PWD}/runcard_name/nnfit/replica_1` + `save_path` + path for the replica, ex: `${PWD}/runcard_name/nnfit` `fitname` - name of the fit - `tr_chi2` - training chi2 - `vl_chi2` - validation chi2 - `true_chi2` - chi2 of the replica to the central experimental data + name of the fit, ex: `Basic_runcard` + `weights_name` + name of the file to save weights to, if not empty """ - # Check the directory exist, if it doesn't, generate it os.makedirs(replica_path_set, exist_ok=True) - stop_epoch = self.stopping_object.stop_epoch + for i in range(len(self.replica_numbers)): + replica_path = f"{save_path}/replica_{self.replica_numbers[i]}" + self._write_chi2(f"{replica_path}/chi2exps.log") + self._write_metadata_json(i, f"{replica_path}/{fitname}.json") + self._export_pdf_grid(i, f"{replica_path}/{fitname}.exportgrid") + if weights_name: + self._write_weights(i, f"{replica_path}/{weights_name}") - # Get the replica status for this object - replica_status = self.stopping_object.get_next_replica() + def _write_chi2s(self, out_path): + # Note: same for all replicas, unless run separately + chi2_log = self.stopping_object.chi2exps_json() + with out_path.open("w", encoding="utf-8") as fs: + json.dump(chi2_log, fs, indent=2, cls=SuperEncoder) - # export PDF grid to file - storefit( - self.pdf_object, - self.replica_number, - replica_path_set, - fitname, - self.q2, + def _write_metadata_json(self, i, out_path): + json_dict = jsonfit( + best_epoch=self.stopping_object.best_epochs[i], + positivity_status=self.stopping_object.positivity_statusses[i], + pdf_object=self.pdf_objects[i], + tr_chi2=self.tr_chi2s[i], + vl_chi2=self.vl_chi2s[i], + true_chi2=self.true_chi2s[i], + # Note: last 2 same for all replicas, unless run separately + timings=self.timings, + stop_epoch=self.stopping_object.stop_epoch, ) - # write the log file for the chi2 - chi2_log = self.stopping_object.chi2exps_json() - with (replica_path_set / "chi2exps.log").open("w", encoding="utf-8") as fs: - json.dump(chi2_log, fs, indent=2, cls = SuperEncoder) + with open(out_path, "w", encoding="utf-8") as fs: + json.dump(json_dict, fs, indent=2, cls=SuperEncoder) - # export all metadata from the fit to a single yaml file - output_file = f"{replica_path_set}/{fitname}.json" - json_dict = jsonfit( - replica_status, self.pdf_object, tr_chi2, vl_chi2, true_chi2, stop_epoch, self.timings + log.info( + "Best fit for replica #%d, chi2=%.3f (tr=%.3f, vl=%.3f)", + self.replica_numbers[i], + self.true_chi2s[i], + self.tr_chi2s[i], + self.vl_chi2s[i], ) - with open(output_file, "w", encoding="utf-8") as fs: - json.dump(json_dict, fs, indent=2, cls = SuperEncoder) + + def _export_pdf_grid(self, i, out_path): + storefit( + self.pdf_objects[i], + self.replica_numbers[i], + out_path, + self.q2, + ) + + def _write_weights(self, i, out_path): + log.info(" > Saving the weights for future in %s", out_path) + # Need to use "str" here because TF 2.2 has a bug for paths objects (fixed in 2.3) + self.pdf_objects[i].save_weights(str(out_path), save_format="h5") class SuperEncoder(json.JSONEncoder): - """ Custom json encoder to get around the fact that np.float32 =/= float """ + """Custom json encoder to get around the fact that np.float32 =/= float""" + def default(self, o): if isinstance(o, np.float32): return float(o) return super().default(o) -def jsonfit(replica_status, pdf_object, tr_chi2, vl_chi2, true_chi2, stop_epoch, timing): +def jsonfit( + best_epoch, positivity_status, pdf_object, tr_chi2, vl_chi2, true_chi2, stop_epoch, timing +): """Generates a dictionary containing all relevant metadata for the fit Parameters ---------- - replica_status: n3fit.stopping.ReplicaBest - a stopping.Validation object + best_epoch: int + epoch at which the best fit was found + positivity_status: str + string describing the positivity status of the fit pdf_object: n3fit.vpinterface.N3PDF N3PDF object constructed from the pdf_model that receives as input a point in x and returns an array of 14 flavours @@ -321,11 +352,11 @@ def jsonfit(replica_status, pdf_object, tr_chi2, vl_chi2, true_chi2, stop_epoch, all_info["preprocessing"] = pdf_object.get_preprocessing_factors() # .fitinfo-like info all_info["stop_epoch"] = stop_epoch - all_info["best_epoch"] = replica_status.best_epoch + all_info["best_epoch"] = best_epoch all_info["erf_tr"] = tr_chi2 all_info["erf_vl"] = vl_chi2 all_info["chi2"] = true_chi2 - all_info["pos_state"] = replica_status.positivity_status + all_info["pos_state"] = positivity_status all_info["arc_lengths"] = vpinterface.compute_arclength(pdf_object).tolist() all_info["integrability"] = vpinterface.integrability_numbers(pdf_object).tolist() all_info["timing"] = timing @@ -335,7 +366,7 @@ def jsonfit(replica_status, pdf_object, tr_chi2, vl_chi2, true_chi2, stop_epoch, def version(): - """ Generates a dictionary with misc version info for this run """ + """Generates a dictionary with misc version info for this run""" versions = {} try: # Wrap tf in try-except block as it could possible to run n3fit without tf @@ -373,61 +404,128 @@ def evln2lha(evln): lha[6] = evln[2] - lha[8] = ( 10*evln[1] - + 30*evln[9] + 10*evln[10] + 5*evln[11] + 3*evln[12] + 2*evln[13] - + 10*evln[3] + 30*evln[4] + 10*evln[5] + 5*evln[6] + 3*evln[7] + 2*evln[8] ) / 120 - - lha[4] = ( 10*evln[1] - + 30*evln[9] + 10*evln[10] + 5*evln[11] + 3*evln[12] + 2*evln[13] - - 10*evln[3] - 30*evln[4] - 10*evln[5] - 5*evln[6] - 3*evln[7] - 2*evln[8] ) / 120 - - lha[7] = ( 10*evln[1] - - 30*evln[9] + 10*evln[10] + 5*evln[11] + 3*evln[12] + 2*evln[13] - + 10*evln[3] - 30*evln[4] + 10*evln[5] + 5*evln[6] + 3*evln[7] + 2*evln[8] ) / 120 - - lha[5] = ( 10*evln[1] - - 30*evln[9] + 10*evln[10] + 5*evln[11] + 3*evln[12] + 2*evln[13] - - 10*evln[3] + 30*evln[4] - 10*evln[5] - 5*evln[6] - 3*evln[7] - 2*evln[8] ) / 120 - - lha[9] = ( 10*evln[1] - - 20*evln[10] + 5*evln[11] + 3*evln[12] + 2*evln[13] - + 10*evln[3] - 20*evln[5] + 5*evln[6] + 3*evln[7] + 2*evln[8] ) / 120 - - lha[3] = ( 10*evln[1] - - 20*evln[10] + 5*evln[11] + 3*evln[12] + 2*evln[13] - - 10*evln[3] + 20*evln[5] - 5*evln[6] - 3*evln[7] - 2*evln[8] ) / 120 - - lha[10] = ( 10*evln[1] - - 15*evln[11] + 3*evln[12] + 2*evln[13] - + 10*evln[3] - 15*evln[6] + 3*evln[7] + 2*evln[8] ) / 120 - - lha[2] = ( 10*evln[1] - - 15*evln[11] + 3*evln[12] + 2*evln[13] - - 10*evln[3] + 15*evln[6] - 3*evln[7] - 2*evln[8] ) / 120 - - lha[11] = ( 5*evln[1] - - 6*evln[12] + evln[13] - + 5*evln[3] - 6*evln[7] + evln[8] ) / 60 - - lha[1] = ( 5*evln[1] - - 6*evln[12] + evln[13] - - 5*evln[3] + 6*evln[7] - evln[8] ) / 60 - - lha[12] = ( evln[1] - - evln[13] - + evln[3] - evln[8] ) / 12 - - lha[0] = ( evln[1] - - evln[13] - - evln[3] + evln[8] ) / 12 + lha[8] = ( + 10 * evln[1] + + 30 * evln[9] + + 10 * evln[10] + + 5 * evln[11] + + 3 * evln[12] + + 2 * evln[13] + + 10 * evln[3] + + 30 * evln[4] + + 10 * evln[5] + + 5 * evln[6] + + 3 * evln[7] + + 2 * evln[8] + ) / 120 + + lha[4] = ( + 10 * evln[1] + + 30 * evln[9] + + 10 * evln[10] + + 5 * evln[11] + + 3 * evln[12] + + 2 * evln[13] + - 10 * evln[3] + - 30 * evln[4] + - 10 * evln[5] + - 5 * evln[6] + - 3 * evln[7] + - 2 * evln[8] + ) / 120 + + lha[7] = ( + 10 * evln[1] + - 30 * evln[9] + + 10 * evln[10] + + 5 * evln[11] + + 3 * evln[12] + + 2 * evln[13] + + 10 * evln[3] + - 30 * evln[4] + + 10 * evln[5] + + 5 * evln[6] + + 3 * evln[7] + + 2 * evln[8] + ) / 120 + + lha[5] = ( + 10 * evln[1] + - 30 * evln[9] + + 10 * evln[10] + + 5 * evln[11] + + 3 * evln[12] + + 2 * evln[13] + - 10 * evln[3] + + 30 * evln[4] + - 10 * evln[5] + - 5 * evln[6] + - 3 * evln[7] + - 2 * evln[8] + ) / 120 + + lha[9] = ( + 10 * evln[1] + - 20 * evln[10] + + 5 * evln[11] + + 3 * evln[12] + + 2 * evln[13] + + 10 * evln[3] + - 20 * evln[5] + + 5 * evln[6] + + 3 * evln[7] + + 2 * evln[8] + ) / 120 + + lha[3] = ( + 10 * evln[1] + - 20 * evln[10] + + 5 * evln[11] + + 3 * evln[12] + + 2 * evln[13] + - 10 * evln[3] + + 20 * evln[5] + - 5 * evln[6] + - 3 * evln[7] + - 2 * evln[8] + ) / 120 + + lha[10] = ( + 10 * evln[1] + - 15 * evln[11] + + 3 * evln[12] + + 2 * evln[13] + + 10 * evln[3] + - 15 * evln[6] + + 3 * evln[7] + + 2 * evln[8] + ) / 120 + + lha[2] = ( + 10 * evln[1] + - 15 * evln[11] + + 3 * evln[12] + + 2 * evln[13] + - 10 * evln[3] + + 15 * evln[6] + - 3 * evln[7] + - 2 * evln[8] + ) / 120 + + lha[11] = (5 * evln[1] - 6 * evln[12] + evln[13] + 5 * evln[3] - 6 * evln[7] + evln[8]) / 60 + + lha[1] = (5 * evln[1] - 6 * evln[12] + evln[13] - 5 * evln[3] + 6 * evln[7] - evln[8]) / 60 + + lha[12] = (evln[1] - evln[13] + evln[3] - evln[8]) / 12 + + lha[0] = (evln[1] - evln[13] - evln[3] + evln[8]) / 12 return lha def storefit( pdf_object, replica, - replica_path, - fitname, + out_path, q20, ): """ @@ -441,16 +539,12 @@ def storefit( that receives as input a point in x and returns an array of 14 flavours `replica` the replica index - `replica_path` - path for this replica - `fitname` - name of the fit `q20` q_0^2 """ # build exportgrid xgrid = XGRID.reshape(-1, 1) - + result = pdf_object(xgrid, flavours="n3fit").squeeze() lha = evln2lha(result.T).T @@ -458,9 +552,24 @@ def storefit( "replica": replica, "q20": q20, "xgrid": xgrid.T.tolist()[0], - "labels": ["TBAR", "BBAR", "CBAR", "SBAR", "UBAR", "DBAR", "GLUON", "D", "U", "S", "C", "B", "T", "PHT"], + "labels": [ + "TBAR", + "BBAR", + "CBAR", + "SBAR", + "UBAR", + "DBAR", + "GLUON", + "D", + "U", + "S", + "C", + "B", + "T", + "PHT", + ], "pdfgrid": lha.tolist(), } - with open(f"{replica_path}/{fitname}.exportgrid", "w") as fs: + with open(out_path, "w") as fs: yaml.dump(data, fs) diff --git a/n3fit/src/n3fit/performfit.py b/n3fit/src/n3fit/performfit.py index 431bb1f5a6..687cafa1bf 100644 --- a/n3fit/src/n3fit/performfit.py +++ b/n3fit/src/n3fit/performfit.py @@ -264,49 +264,20 @@ def performfit( log.info("Stopped at epoch=%d", stopping_object.stop_epoch) final_time = stopwatch.stop() - all_training_chi2, all_val_chi2, all_exp_chi2 = the_model_trainer.evaluate(stopping_object) + all_chi2s = the_model_trainer.evaluate(stopping_object) pdf_models = result["pdf_models"] - for i, (replica_number, pdf_model) in enumerate(zip(replica_idxs, pdf_models)): - # Each model goes into its own replica folder - replica_path_set = replica_path / f"replica_{replica_number}" - - # Create a pdf instance - q0 = theoryid.get_description().get("Q0") - pdf_instance = N3PDF(pdf_model, fit_basis=basis, Q=q0) - - # Generate the writer wrapper - writer_wrapper = WriterWrapper( - replica_number, - pdf_instance, - stopping_object, - q0**2, - final_time, - ) - - # Get the right chi2s - training_chi2 = np.take(all_training_chi2, i) - val_chi2 = np.take(all_val_chi2, i) - exp_chi2 = np.take(all_exp_chi2, i) - - # And write the data down - writer_wrapper.write_data( - replica_path_set, output_path.name, training_chi2, val_chi2, exp_chi2 - ) - log.info( - "Best fit for replica #%d, chi2=%.3f (tr=%.3f, vl=%.3f)", - replica_number, - exp_chi2, - training_chi2, - val_chi2, - ) - - # Save the weights to some file for the given replica - if save: - model_file_path = replica_path_set / save - log.info(" > Saving the weights for future in %s", model_file_path) - # Need to use "str" here because TF 2.2 has a bug for paths objects (fixed in 2.3) - pdf_model.save_weights(str(model_file_path), save_format="h5") + q0 = theoryid.get_description().get("Q0") + pdf_instances = [N3PDF(pdf_model, fit_basis=basis, Q=q0) for pdf_model in pdf_models] + writer_wrapper = WriterWrapper( + replica_idxs, + pdf_instances, + stopping_object, + all_chi2s, + q0**2, + final_time, + ) + writer_wrapper.write_data(replica_path, output_path.name) if tensorboard is not None: log.info("Tensorboard logging information is stored at %s", log_path) From df0cc8fcb45da69e0573961814bf9ff8dde6b1e9 Mon Sep 17 00:00:00 2001 From: Aron Date: Mon, 14 Aug 2023 19:34:11 +0200 Subject: [PATCH 03/29] Undo assumed changes to model structure --- n3fit/src/n3fit/stopping.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py index 2786df4876..51d31babaf 100644 --- a/n3fit/src/n3fit/stopping.py +++ b/n3fit/src/n3fit/stopping.py @@ -214,11 +214,11 @@ def all_vl_chi2(self): return self._vl_dict def all_tr_chi2_for_replica(self, r): - """ " Return the tr chi2 per dataset for a given replica""" + """Return the tr chi2 per dataset for a given replica""" return {k: np.take(i, r) for k, i in self.all_tr_chi2.items()} def all_vl_chi2_for_replica(self, r): - """ " Return the vl chi2 per dataset for a given replica""" + """Return the vl chi2 per dataset for a given replica""" return {k: np.take(i, r) for k, i in self.all_vl_chi2.items()} def total_partial_tr_chi2(self): @@ -310,8 +310,8 @@ class FitHistory: Parameters ---------- - pdf_model: n3fit.backends.MetaModel - PDF model being trained, used to saved the weights + pdf_models: n3fit.backends.MetaModel + list of PDF models being trained, used to saved the weights """ def __init__(self, pdf_models, tr_ndata, vl_ndata): @@ -416,8 +416,8 @@ class Stopping: all_data_dict: dict list containg all dictionaries containing all information about the experiments/validation/regularizers/etc to be parsed by Stopping - pdf_model: n3fit.backends.MetaModel - the pdf model being trained + pdf_models: list(n3fit.backends.MetaModel) + list of pdf_models being trained threshold_positivity: float maximum value allowed for the sum of all positivity losses total_epochs: int @@ -432,7 +432,7 @@ def __init__( self, validation_model, all_data_dicts, - pdf_model, + pdf_models, threshold_positivity=THRESHOLD_POS, total_epochs=0, stopping_patience=7000, @@ -444,13 +444,13 @@ def __init__( # Create the History object tr_ndata, vl_ndata, pos_sets = parse_ndata(all_data_dicts) - self._history = FitHistory(pdf_model, tr_ndata, vl_ndata) + self._history = FitHistory(pdf_models, tr_ndata, vl_ndata) # And the positivity checker self._positivity = Positivity(threshold_positivity, pos_sets) # Initialize internal variables for the stopping - self.n_replicas = pdf_model.output_shape[-1] + self.n_replicas = len(pdf_models) self.threshold_chi2 = threshold_chi2 self.stopping_degree = np.zeros(self.n_replicas, dtype=int) self.count = np.zeros(self.n_replicas, dtype=int) From 0c11088bea6753bbceb187e32cb5f1410d95b9bf Mon Sep 17 00:00:00 2001 From: Aron Date: Mon, 14 Aug 2023 19:56:01 +0200 Subject: [PATCH 04/29] various fixes --- n3fit/src/n3fit/io/writer.py | 31 ++++++++++++++++++++----------- n3fit/src/n3fit/performfit.py | 2 +- n3fit/src/n3fit/stopping.py | 2 +- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/n3fit/src/n3fit/io/writer.py b/n3fit/src/n3fit/io/writer.py index fc4d24ffdb..64b3b3424a 100644 --- a/n3fit/src/n3fit/io/writer.py +++ b/n3fit/src/n3fit/io/writer.py @@ -5,6 +5,7 @@ so previously active scripts can still work. """ import json +import logging import os import numpy as np @@ -14,6 +15,8 @@ from reportengine.compat import yaml import validphys +log = logging.getLogger(__name__) + XGRID = np.array( [ 1.00000000000000e-09, @@ -259,11 +262,13 @@ def write_data(self, save_path, fitname, weights_name): `weights_name` name of the file to save weights to, if not empty """ - os.makedirs(replica_path_set, exist_ok=True) + os.makedirs(save_path, exist_ok=True) for i in range(len(self.replica_numbers)): replica_path = f"{save_path}/replica_{self.replica_numbers[i]}" - self._write_chi2(f"{replica_path}/chi2exps.log") + os.makedirs(replica_path, exist_ok=True) + + self._write_chi2s(f"{replica_path}/chi2exps.log") self._write_metadata_json(i, f"{replica_path}/{fitname}.json") self._export_pdf_grid(i, f"{replica_path}/{fitname}.exportgrid") if weights_name: @@ -272,7 +277,7 @@ def write_data(self, save_path, fitname, weights_name): def _write_chi2s(self, out_path): # Note: same for all replicas, unless run separately chi2_log = self.stopping_object.chi2exps_json() - with out_path.open("w", encoding="utf-8") as fs: + with open(out_path, "w", encoding="utf-8") as fs: json.dump(chi2_log, fs, indent=2, cls=SuperEncoder) def _write_metadata_json(self, i, out_path): @@ -280,11 +285,11 @@ def _write_metadata_json(self, i, out_path): best_epoch=self.stopping_object.best_epochs[i], positivity_status=self.stopping_object.positivity_statusses[i], pdf_object=self.pdf_objects[i], - tr_chi2=self.tr_chi2s[i], - vl_chi2=self.vl_chi2s[i], - true_chi2=self.true_chi2s[i], + tr_chi2=self.tr_chi2[i], + vl_chi2=self.vl_chi2[i], + true_chi2=self.true_chi2[i], # Note: last 2 same for all replicas, unless run separately - timings=self.timings, + timing=self.timings, stop_epoch=self.stopping_object.stop_epoch, ) @@ -294,9 +299,9 @@ def _write_metadata_json(self, i, out_path): log.info( "Best fit for replica #%d, chi2=%.3f (tr=%.3f, vl=%.3f)", self.replica_numbers[i], - self.true_chi2s[i], - self.tr_chi2s[i], - self.vl_chi2s[i], + self.true_chi2[i], + self.tr_chi2[i], + self.vl_chi2[i], ) def _export_pdf_grid(self, i, out_path): @@ -309,8 +314,10 @@ def _export_pdf_grid(self, i, out_path): def _write_weights(self, i, out_path): log.info(" > Saving the weights for future in %s", out_path) + # Extract model out of N3PDF + model = self.pdf_objects[i]._models[0] # Need to use "str" here because TF 2.2 has a bug for paths objects (fixed in 2.3) - self.pdf_objects[i].save_weights(str(out_path), save_format="h5") + model.save_weights(str(out_path), save_format="h5") class SuperEncoder(json.JSONEncoder): @@ -539,6 +546,8 @@ def storefit( that receives as input a point in x and returns an array of 14 flavours `replica` the replica index + `out_path` + the path where to store the output `q20` q_0^2 """ diff --git a/n3fit/src/n3fit/performfit.py b/n3fit/src/n3fit/performfit.py index 687cafa1bf..85c393c7df 100644 --- a/n3fit/src/n3fit/performfit.py +++ b/n3fit/src/n3fit/performfit.py @@ -277,7 +277,7 @@ def performfit( q0**2, final_time, ) - writer_wrapper.write_data(replica_path, output_path.name) + writer_wrapper.write_data(replica_path, output_path.name, save) if tensorboard is not None: log.info("Tensorboard logging information is stored at %s", log_path) diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py index 51d31babaf..354a42877d 100644 --- a/n3fit/src/n3fit/stopping.py +++ b/n3fit/src/n3fit/stopping.py @@ -460,7 +460,7 @@ def __init__( self.stopping_patience = stopping_patience self.total_epochs = total_epochs - self.best_epochs = np.zeros(self.n_replicas, dtype=int) + self.best_epochs = [0] * self.n_replicas self.positivity_statusses = np.repeat(POS_BAD, self.n_replicas) @property From 5db1c67e4eef3541f502121717ce106218b38bff Mon Sep 17 00:00:00 2001 From: Aron Date: Mon, 14 Aug 2023 20:28:26 +0200 Subject: [PATCH 05/29] Update best_epochs and positivity_statusses --- n3fit/src/n3fit/stopping.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py index 354a42877d..6e67c1b99c 100644 --- a/n3fit/src/n3fit/stopping.py +++ b/n3fit/src/n3fit/stopping.py @@ -556,6 +556,8 @@ def monitor_chi2(self, training_info, epoch, print_stats=False): # Step 5. loop over the valid indices to check whether the vl improved for i in np.where(passes)[0]: + self.best_epochs[i] = epoch + self.positivity_statusses[i] = POS_OK self._history.save_best_replica(i) self.stopping_degree[i] = 0 self.count[i] = 1 From 1a7de34edb8f7286b588d1227412927b43b3fd70 Mon Sep 17 00:00:00 2001 From: Aron Date: Mon, 14 Aug 2023 20:34:34 +0200 Subject: [PATCH 06/29] Remove now unused iterators over replicas --- n3fit/src/n3fit/stopping.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py index 6e67c1b99c..8cfa57c7b0 100644 --- a/n3fit/src/n3fit/stopping.py +++ b/n3fit/src/n3fit/stopping.py @@ -304,10 +304,6 @@ class FitHistory: It also keeps track of the best epoch and the associated weights. - Can be iterated when there are snapshots of the fit being saved. - When iterated it will rewind the fit to each of the point in history - that have been saved. - Parameters ---------- pdf_models: n3fit.backends.MetaModel @@ -320,7 +316,6 @@ def __init__(self, pdf_models, tr_ndata, vl_ndata): self._replicas = [] for pdf_model in pdf_models: self._replicas.append(ReplicaState(pdf_model)) - self._iter_replicas = iter(self._replicas) if vl_ndata is None: vl_ndata = tr_ndata @@ -397,9 +392,6 @@ def reload(self): replica.stop_training(self.final_epoch) replica.reload() - def __next__(self): - return next(self._iter_replicas) - class Stopping: """ @@ -607,10 +599,6 @@ def stop_here(self): else: return self.stop_now - def get_next_replica(self): - """Return the next ReplicaState object""" - return next(self._history) - def chi2exps_json(self, replica=0, log_each=100): """ Returns and apt-for-json dictionary with the status of the fit every `log_each` epochs From 74bc818820137f3a7b26b41d981e929f1d3e0f4a Mon Sep 17 00:00:00 2001 From: Aron Date: Tue, 15 Aug 2023 10:21:58 +0200 Subject: [PATCH 07/29] Remove now unused positivity_pass and positivity_status from ReplicaState --- n3fit/src/n3fit/stopping.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py index 8cfa57c7b0..351bfce848 100644 --- a/n3fit/src/n3fit/stopping.py +++ b/n3fit/src/n3fit/stopping.py @@ -252,13 +252,6 @@ def __init__(self, pdf_model): self._stop_epoch = None self._best_vl_chi2 = INITIAL_CHI2 - def positivity_pass(self): - """By definition, if we have a ``best_epoch`` then positivity passed""" - if self._best_epoch is None: - return False - else: - return True - @property def best_epoch(self): if self._best_epoch is None: @@ -273,13 +266,6 @@ def stop_epoch(self): def best_vl(self): return float(self._best_vl_chi2) - @property - def positivity_status(self): - if self.positivity_pass(): - return POS_OK - else: - return POS_BAD - def register_best(self, chi2, epoch): """Register a new best state and some metadata about it""" self._weights = self._pdf_model.get_weights() @@ -549,7 +535,9 @@ def monitor_chi2(self, training_info, epoch, print_stats=False): # Step 5. loop over the valid indices to check whether the vl improved for i in np.where(passes)[0]: self.best_epochs[i] = epoch + # By definition, if we have a ``best_epoch`` then positivity passed self.positivity_statusses[i] = POS_OK + self._history.save_best_replica(i) self.stopping_degree[i] = 0 self.count[i] = 1 From e04d33625ef328cb6450d4535585ef64d21088a3 Mon Sep 17 00:00:00 2001 From: Aron Date: Tue, 15 Aug 2023 10:46:10 +0200 Subject: [PATCH 08/29] Move all best and stop_epoch to Stopping, remove trainable=False --- n3fit/src/n3fit/stopping.py | 42 ++++++++----------------------------- 1 file changed, 9 insertions(+), 33 deletions(-) diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py index 351bfce848..36968f13f2 100644 --- a/n3fit/src/n3fit/stopping.py +++ b/n3fit/src/n3fit/stopping.py @@ -248,20 +248,8 @@ class ReplicaState: def __init__(self, pdf_model): self._pdf_model = pdf_model self._weights = None - self._best_epoch = None - self._stop_epoch = None self._best_vl_chi2 = INITIAL_CHI2 - @property - def best_epoch(self): - if self._best_epoch is None: - return self.stop_epoch - return self._best_epoch - - @property - def stop_epoch(self): - return self._stop_epoch - @property def best_vl(self): return float(self._best_vl_chi2) @@ -269,7 +257,6 @@ def best_vl(self): def register_best(self, chi2, epoch): """Register a new best state and some metadata about it""" self._weights = self._pdf_model.get_weights() - self._best_epoch = epoch self._best_vl_chi2 = chi2 def reload(self): @@ -277,12 +264,6 @@ def reload(self): if self._weights: self._pdf_model.set_weights(self._weights) - def stop_training(self, epoch=None): - """Stop training this replica if not stopped before""" - if self._pdf_model.trainable: - self._pdf_model.trainable = False - self._stop_epoch = epoch - class FitHistory: """ @@ -317,11 +298,6 @@ def __init__(self, pdf_models, tr_ndata, vl_ndata): self._history = [] self.final_epoch = None - @property - def best_epoch(self): - """Return the best epoch per replica""" - return [i.best_epoch for i in self._replicas] - def get_state(self, epoch): """Get the FitState of the system for a given epoch""" try: @@ -366,16 +342,11 @@ def register(self, epoch, training_info, validation_info): self._history.append(fitstate) return fitstate - def stop_training_replica(self, i, e): - """Stop training replica i in epoch e""" - self._replicas[i].stop_training(e) - def reload(self): """Reloads the best fit weights into the model if there are models to be reloaded Ensure that all replicas have stopped at this point. """ for replica in self._replicas: - replica.stop_training(self.final_epoch) replica.reload() @@ -438,8 +409,9 @@ def __init__( self.stopping_patience = stopping_patience self.total_epochs = total_epochs - self.best_epochs = [0] * self.n_replicas - self.positivity_statusses = np.repeat(POS_BAD, self.n_replicas) + self.stop_epochs = [None] * self.n_replicas + self.best_epochs = [None] * self.n_replicas + self.positivity_statusses = [POS_BAD] * self.n_replicas @property def vl_chi2(self): @@ -451,7 +423,11 @@ def vl_chi2(self): @property def e_best_chi2(self): """Epoch of the best chi2, if there is no best epoch, return last""" - return self._history.best_epoch + best_or_last_epochs = [ + best if best is not None else last + for best, last in zip(self.best_epochs, self.stop_epochs) + ] + return best_or_last_epochs @property def stop_epoch(self): @@ -544,8 +520,8 @@ def monitor_chi2(self, training_info, epoch, print_stats=False): stop_replicas = self.count & (self.stopping_degree > self.stopping_patience) for i in np.where(stop_replicas)[0]: + self.stop_epochs[i] = epoch self.count[i] = 0 - self._history.stop_training_replica(i, epoch) # By using the stopping degree we only stop when none of the replicas are improving anymore if min(self.stopping_degree) > self.stopping_patience: From 984d4fd1f8401270c7d4544785502f8e08a1cc75 Mon Sep 17 00:00:00 2001 From: Aron Date: Tue, 15 Aug 2023 10:49:58 +0200 Subject: [PATCH 09/29] Remove all_positivity_status --- n3fit/src/n3fit/stopping.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py index 36968f13f2..0ddae801c6 100644 --- a/n3fit/src/n3fit/stopping.py +++ b/n3fit/src/n3fit/stopping.py @@ -317,10 +317,6 @@ def save_best_replica(self, i, epoch=None): loss = self.get_state(epoch).vl_loss[i] self._replicas[i].register_best(loss, epoch) - def all_positivity_status(self): - """Returns whether the positivity passed or not per replica""" - return np.array([i.positivity_status for i in self._replicas]) - def all_best_vl_loss(self): """Returns the best validation loss for each replica""" return np.array([i.best_vl for i in self._replicas]) @@ -438,7 +434,7 @@ def stop_epoch(self): def positivity_status(self): """Returns POS_PASS if positivity passes or veto if it doesn't for each replica""" - return self._history.all_positivity_status() + return self.positivity_statusses def evaluate_training(self, training_model): """Given the training model, evaluates the From 57b3be1f1b505657f7bda7977f33e695e957c65a Mon Sep 17 00:00:00 2001 From: Aron Date: Tue, 15 Aug 2023 10:59:47 +0200 Subject: [PATCH 10/29] Completely remove ReplicaState, move remaining functionality to FitHistory --- n3fit/src/n3fit/stopping.py | 42 +++++++------------------------------ 1 file changed, 8 insertions(+), 34 deletions(-) diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py index 0ddae801c6..2a107606ea 100644 --- a/n3fit/src/n3fit/stopping.py +++ b/n3fit/src/n3fit/stopping.py @@ -241,30 +241,6 @@ def __str__(self): return f"chi2: tr={self.tr_chi2} vl={self.vl_chi2}" -class ReplicaState: - """Extra complication which eventually will be merged with someone else - but it is here only for development.""" - - def __init__(self, pdf_model): - self._pdf_model = pdf_model - self._weights = None - self._best_vl_chi2 = INITIAL_CHI2 - - @property - def best_vl(self): - return float(self._best_vl_chi2) - - def register_best(self, chi2, epoch): - """Register a new best state and some metadata about it""" - self._weights = self._pdf_model.get_weights() - self._best_vl_chi2 = chi2 - - def reload(self): - """Reload the weights of the best state""" - if self._weights: - self._pdf_model.set_weights(self._weights) - - class FitHistory: """ Keeps a list of FitState items holding the full history of the fit. @@ -278,11 +254,9 @@ class FitHistory: """ def __init__(self, pdf_models, tr_ndata, vl_ndata): - # Create a ReplicaState object for all models - # which will hold the best chi2 and weights per replica - self._replicas = [] - for pdf_model in pdf_models: - self._replicas.append(ReplicaState(pdf_model)) + self._replicas = pdf_models + self._best_weights = [None] * len(pdf_models) + self._best_val_chi2s = [INITIAL_CHI2] * len(pdf_models) if vl_ndata is None: vl_ndata = tr_ndata @@ -314,12 +288,12 @@ def save_best_replica(self, i, epoch=None): """ if epoch is None: epoch = self.final_epoch - loss = self.get_state(epoch).vl_loss[i] - self._replicas[i].register_best(loss, epoch) + self._best_val_chi2s[i] = self.get_state(epoch).vl_loss[i] + self._best_weights[i] = self._replicas[i].get_weights() def all_best_vl_loss(self): """Returns the best validation loss for each replica""" - return np.array([i.best_vl for i in self._replicas]) + return np.array(self._best_val_chi2s) def register(self, epoch, training_info, validation_info): """Save a new fitstate and updates the current final epoch @@ -342,8 +316,8 @@ def reload(self): """Reloads the best fit weights into the model if there are models to be reloaded Ensure that all replicas have stopped at this point. """ - for replica in self._replicas: - replica.reload() + for replica, weights in zip(self._replicas, self._best_weights): + replica.set_weights(weights) class Stopping: From 85e0061ac12de14b1c374b6da7fe7f1a5f30cd96 Mon Sep 17 00:00:00 2001 From: Aron Date: Tue, 15 Aug 2023 11:25:01 +0200 Subject: [PATCH 11/29] Uniformize notation --- n3fit/src/n3fit/stopping.py | 68 ++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py index 2a107606ea..f7953b6da4 100644 --- a/n3fit/src/n3fit/stopping.py +++ b/n3fit/src/n3fit/stopping.py @@ -164,8 +164,8 @@ def __init__(self, training_info, validation_info): self.training = training_info self.validation = validation_info self._parsed = False - self._vl_chi2 = None - self._tr_chi2 = None + self._vl_chi2 = None # These are per replica + self._tr_chi2 = None # This is an overall training chi2 self._vl_dict = None self._tr_dict = None @@ -213,21 +213,21 @@ def all_vl_chi2(self): self._parse_chi2() return self._vl_dict - def all_tr_chi2_for_replica(self, r): + def all_tr_chi2_for_replica(self, i_replica): """Return the tr chi2 per dataset for a given replica""" - return {k: np.take(i, r) for k, i in self.all_tr_chi2.items()} + return {k: np.take(v, i_replica) for k, v in self.all_tr_chi2.items()} - def all_vl_chi2_for_replica(self, r): + def all_vl_chi2_for_replica(self, i_replica): """Return the vl chi2 per dataset for a given replica""" - return {k: np.take(i, r) for k, i in self.all_vl_chi2.items()} + return {k: np.take(v, i_replica) for k, v in self.all_vl_chi2.items()} def total_partial_tr_chi2(self): """Return the tr chi2 summed over replicas per experiment""" - return {k: np.sum(i) for k, i in self.all_tr_chi2.items()} + return {k: np.sum(v) for k, v in self.all_tr_chi2.items()} def total_partial_vl_chi2(self): """Return the vl chi2 summed over replicas per experiment""" - return {k: np.sum(i) for k, i in self.all_tr_chi2.items()} + return {k: np.sum(v) for k, v in self.all_tr_chi2.items()} def total_tr_chi2(self): """Return the total tr chi2 summed over replicas""" @@ -281,15 +281,15 @@ def get_state(self, epoch): f"Tried to get obtain the state for epoch {epoch} when only {len(self._history)} epochs have been saved" ) from e - def save_best_replica(self, i, epoch=None): - """Save the state of replica ``i`` as a best fit so far. + def save_best_replica(self, i_replica, epoch=None): + """Save the state of replica ``i_replica`` as a best fit so far. If an epoch is given, save the best as the given epoch, otherwise use the last one """ if epoch is None: epoch = self.final_epoch - self._best_val_chi2s[i] = self.get_state(epoch).vl_loss[i] - self._best_weights[i] = self._replicas[i].get_weights() + self._best_val_chi2s[i_replica] = self.get_state(epoch).vl_loss[i_replica] + self._best_weights[i_replica] = self._replicas[i_replica].get_weights() def all_best_vl_loss(self): """Returns the best validation loss for each replica""" @@ -371,8 +371,8 @@ def __init__( # Initialize internal variables for the stopping self.n_replicas = len(pdf_models) self.threshold_chi2 = threshold_chi2 - self.stopping_degree = np.zeros(self.n_replicas, dtype=int) - self.count = np.zeros(self.n_replicas, dtype=int) + self.stopping_degrees = np.zeros(self.n_replicas, dtype=int) + self.counts = np.zeros(self.n_replicas, dtype=int) self.dont_stop = dont_stop self.stop_now = False @@ -471,30 +471,30 @@ def monitor_chi2(self, training_info, epoch, print_stats=False): # this means improving vl_chi2 and passing positivity # Don't start counting until the chi2 of the validation goes below a certain threshold # once we start counting, don't bother anymore - passes = self.count | (fitstate.vl_chi2 < self.threshold_chi2) + passes = self.counts | (fitstate.vl_chi2 < self.threshold_chi2) passes &= fitstate.vl_loss < self._history.all_best_vl_loss() # And the ones that pass positivity passes &= self._positivity(fitstate) - self.stopping_degree += self.count + self.stopping_degrees += self.counts # Step 5. loop over the valid indices to check whether the vl improved - for i in np.where(passes)[0]: - self.best_epochs[i] = epoch + for i_replica in np.where(passes)[0]: + self.best_epochs[i_replica] = epoch # By definition, if we have a ``best_epoch`` then positivity passed - self.positivity_statusses[i] = POS_OK + self.positivity_statusses[i_replica] = POS_OK - self._history.save_best_replica(i) - self.stopping_degree[i] = 0 - self.count[i] = 1 + self._history.save_best_replica(i_replica) + self.stopping_degrees[i_replica] = 0 + self.counts[i_replica] = 1 - stop_replicas = self.count & (self.stopping_degree > self.stopping_patience) - for i in np.where(stop_replicas)[0]: - self.stop_epochs[i] = epoch - self.count[i] = 0 + stop_replicas = self.counts & (self.stopping_degrees > self.stopping_patience) + for i_replica in np.where(stop_replicas)[0]: + self.stop_epochs[i_replica] = epoch + self.counts[i_replica] = 0 # By using the stopping degree we only stop when none of the replicas are improving anymore - if min(self.stopping_degree) > self.stopping_patience: + if min(self.stopping_degrees) > self.stopping_patience: self.make_stop() return True @@ -533,13 +533,13 @@ def stop_here(self): else: return self.stop_now - def chi2exps_json(self, replica=0, log_each=100): + def chi2exps_json(self, i_replica=0, log_each=100): """ Returns and apt-for-json dictionary with the status of the fit every `log_each` epochs Parameters ---------- - replica: int + i_replica: int which replica are we writing the log for log_each: int every how many epochs to print the log @@ -552,10 +552,10 @@ def chi2exps_json(self, replica=0, log_each=100): final_epoch = self._history.final_epoch json_dict = {} - for i in range(log_each - 1, final_epoch + 1, log_each): - fitstate = self._history.get_state(i) - all_tr = fitstate.all_tr_chi2_for_replica(replica) - all_vl = fitstate.all_vl_chi2_for_replica(replica) + for epoch in range(log_each - 1, final_epoch + 1, log_each): + fitstate = self._history.get_state(epoch) + all_tr = fitstate.all_tr_chi2_for_replica(i_replica) + all_vl = fitstate.all_vl_chi2_for_replica(i_replica) tmp = {exp: {"training": tr_chi2} for exp, tr_chi2 in all_tr.items()} for exp, vl_chi2 in all_vl.items(): @@ -563,7 +563,7 @@ def chi2exps_json(self, replica=0, log_each=100): tmp[exp] = {"training": None} tmp[exp]["validation"] = vl_chi2 - json_dict[i + 1] = tmp + json_dict[epoch + 1] = tmp return json_dict From 6c1229aec9e7a62d6150038a7efe08aadc62c652 Mon Sep 17 00:00:00 2001 From: Aron Date: Tue, 15 Aug 2023 11:30:59 +0200 Subject: [PATCH 12/29] Improve documentation --- n3fit/src/n3fit/stopping.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py index f7953b6da4..dc037e7311 100644 --- a/n3fit/src/n3fit/stopping.py +++ b/n3fit/src/n3fit/stopping.py @@ -251,6 +251,10 @@ class FitHistory: ---------- pdf_models: n3fit.backends.MetaModel list of PDF models being trained, used to saved the weights + tr_ndata: dict + dictionary of {dataset: n_points} for the training data + vl_ndata: dict + dictionary of {dataset: n_points} for the validation data """ def __init__(self, pdf_models, tr_ndata, vl_ndata): From 2c4886911385a74527a90660b1f8bd735c0b2773 Mon Sep 17 00:00:00 2001 From: Aron Date: Tue, 15 Aug 2023 12:18:53 +0200 Subject: [PATCH 13/29] Move all but the losses from FitHistory to Stopping itself --- n3fit/src/n3fit/stopping.py | 47 ++++++++++++------------------------- 1 file changed, 15 insertions(+), 32 deletions(-) diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py index dc037e7311..be780265f3 100644 --- a/n3fit/src/n3fit/stopping.py +++ b/n3fit/src/n3fit/stopping.py @@ -249,19 +249,13 @@ class FitHistory: Parameters ---------- - pdf_models: n3fit.backends.MetaModel - list of PDF models being trained, used to saved the weights tr_ndata: dict dictionary of {dataset: n_points} for the training data vl_ndata: dict dictionary of {dataset: n_points} for the validation data """ - def __init__(self, pdf_models, tr_ndata, vl_ndata): - self._replicas = pdf_models - self._best_weights = [None] * len(pdf_models) - self._best_val_chi2s = [INITIAL_CHI2] * len(pdf_models) - + def __init__(self, tr_ndata, vl_ndata): if vl_ndata is None: vl_ndata = tr_ndata vl_suffix = "loss" @@ -285,20 +279,6 @@ def get_state(self, epoch): f"Tried to get obtain the state for epoch {epoch} when only {len(self._history)} epochs have been saved" ) from e - def save_best_replica(self, i_replica, epoch=None): - """Save the state of replica ``i_replica`` as a best fit so far. - If an epoch is given, save the best as the given epoch, otherwise - use the last one - """ - if epoch is None: - epoch = self.final_epoch - self._best_val_chi2s[i_replica] = self.get_state(epoch).vl_loss[i_replica] - self._best_weights[i_replica] = self._replicas[i_replica].get_weights() - - def all_best_vl_loss(self): - """Returns the best validation loss for each replica""" - return np.array(self._best_val_chi2s) - def register(self, epoch, training_info, validation_info): """Save a new fitstate and updates the current final epoch @@ -316,13 +296,6 @@ def register(self, epoch, training_info, validation_info): self._history.append(fitstate) return fitstate - def reload(self): - """Reloads the best fit weights into the model if there are models to be reloaded - Ensure that all replicas have stopped at this point. - """ - for replica, weights in zip(self._replicas, self._best_weights): - replica.set_weights(weights) - class Stopping: """ @@ -362,12 +335,14 @@ def __init__( threshold_chi2=10.0, dont_stop=False, ): + self.pdf_models = pdf_models + # Save the validation object self._validation = validation_model # Create the History object tr_ndata, vl_ndata, pos_sets = parse_ndata(all_data_dicts) - self._history = FitHistory(pdf_models, tr_ndata, vl_ndata) + self._history = FitHistory(tr_ndata, vl_ndata) # And the positivity checker self._positivity = Positivity(threshold_positivity, pos_sets) @@ -386,6 +361,8 @@ def __init__( self.stop_epochs = [None] * self.n_replicas self.best_epochs = [None] * self.n_replicas self.positivity_statusses = [POS_BAD] * self.n_replicas + self._best_weights = [None] * self.n_replicas + self._best_val_chi2s = [INITIAL_CHI2] * self.n_replicas @property def vl_chi2(self): @@ -476,7 +453,7 @@ def monitor_chi2(self, training_info, epoch, print_stats=False): # Don't start counting until the chi2 of the validation goes below a certain threshold # once we start counting, don't bother anymore passes = self.counts | (fitstate.vl_chi2 < self.threshold_chi2) - passes &= fitstate.vl_loss < self._history.all_best_vl_loss() + passes &= fitstate.vl_loss < self._best_val_chi2s # And the ones that pass positivity passes &= self._positivity(fitstate) @@ -488,7 +465,9 @@ def monitor_chi2(self, training_info, epoch, print_stats=False): # By definition, if we have a ``best_epoch`` then positivity passed self.positivity_statusses[i_replica] = POS_OK - self._history.save_best_replica(i_replica) + self._best_val_chi2s[i_replica] = self._history.get_state(epoch).vl_loss[i_replica] + self._best_weights[i_replica] = self.pdf_models[i_replica].get_weights() + self.stopping_degrees[i_replica] = 0 self.counts[i_replica] = 1 @@ -507,7 +486,11 @@ def make_stop(self): and reload the history to the point of the best model if any """ self.stop_now = True - self._history.reload() + self._restore_best_weights() + + def _restore_best_weights(self): + for replica, weights in zip(self.pdf_models, self._best_weights): + replica.set_weights(weights) def print_current_stats(self, epoch, fitstate): """ From df0dae06a71577dcde33d2777a3d43789760ea9d Mon Sep 17 00:00:00 2001 From: Aron Date: Tue, 15 Aug 2023 12:49:15 +0200 Subject: [PATCH 14/29] Update documentation --- n3fit/src/n3fit/stopping.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py index be780265f3..2e52a0cda3 100644 --- a/n3fit/src/n3fit/stopping.py +++ b/n3fit/src/n3fit/stopping.py @@ -134,11 +134,7 @@ def parse_losses(history_object, data, suffix="loss"): class FitState: """ - Holds the state of the chi2 during the fit for all replicas - - It holds the necessary information to reload the fit - to a specific point in time if we are interested on reloading - (otherwise the relevant variables stay empty to save memory) + Holds the state of the chi2 during the fit, for all replicas and one epoch Note: the training chi2 is computed before the update of the weights so it is the chi2 that informed the updated corresponding to this state. @@ -243,9 +239,7 @@ def __str__(self): class FitHistory: """ - Keeps a list of FitState items holding the full history of the fit. - - It also keeps track of the best epoch and the associated weights. + Keeps a list of FitState items holding the full chi2 history of the fit. Parameters ---------- @@ -284,11 +278,16 @@ def register(self, epoch, training_info, validation_info): Parameters ---------- - fitstate: FitState - FitState object - the fitstate of the object to save epoch: int the current epoch of the fit + training_info: dict + all losses for the training model + validation_info: dict + all losses for the validation model + + Returns + ------- + FitState """ # Save all the information in a fitstate object fitstate = FitState(training_info, validation_info) @@ -309,7 +308,7 @@ class Stopping: validation_model: n3fit.backends.MetaModel the model with the validation mask applied (and compiled with the validation data and covmat) - all_data_dict: dict + all_data_dicts: dict list containg all dictionaries containing all information about the experiments/validation/regularizers/etc to be parsed by Stopping pdf_models: list(n3fit.backends.MetaModel) @@ -320,6 +319,8 @@ class Stopping: total number of epochs stopping_patience: int how many epochs to wait for the validation loss to improve + threshold_chi2: float + maximum value allowed for chi2 dont_stop: bool dont care about early stopping """ From 635e6a0808ad13991f24920df91063c632a7ddd4 Mon Sep 17 00:00:00 2001 From: Aron Date: Wed, 16 Aug 2023 13:56:16 +0200 Subject: [PATCH 15/29] Indicate private attributes --- n3fit/src/n3fit/stopping.py | 72 ++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py index 2e52a0cda3..848f57f2f8 100644 --- a/n3fit/src/n3fit/stopping.py +++ b/n3fit/src/n3fit/stopping.py @@ -157,7 +157,7 @@ def __init__(self, training_info, validation_info): raise ValueError( "FitState cannot be instantiated until vl_ndata, tr_ndata and vl_suffix are filled" ) - self.training = training_info + self._training = training_info self.validation = validation_info self._parsed = False self._vl_chi2 = None # These are per replica @@ -173,7 +173,7 @@ def vl_loss(self): @property def tr_loss(self): """Return the total validation loss as it comes from the info dictionaries""" - return self.training.get("loss") + return self._training.get("loss") def _parse_chi2(self): """ @@ -182,8 +182,8 @@ def _parse_chi2(self): """ if self._parsed: return - if self.training is not None: - self._tr_chi2, self._tr_dict = parse_losses(self.training, self.tr_ndata) + if self._training is not None: + self._tr_chi2, self._tr_dict = parse_losses(self._training, self.tr_ndata) if self.validation is not None: self._vl_chi2, self._vl_dict = parse_losses( self.validation, self.vl_ndata, suffix=self.vl_suffix @@ -336,7 +336,7 @@ def __init__( threshold_chi2=10.0, dont_stop=False, ): - self.pdf_models = pdf_models + self._pdf_models = pdf_models # Save the validation object self._validation = validation_model @@ -349,21 +349,21 @@ def __init__( self._positivity = Positivity(threshold_positivity, pos_sets) # Initialize internal variables for the stopping - self.n_replicas = len(pdf_models) - self.threshold_chi2 = threshold_chi2 - self.stopping_degrees = np.zeros(self.n_replicas, dtype=int) - self.counts = np.zeros(self.n_replicas, dtype=int) - - self.dont_stop = dont_stop - self.stop_now = False - self.stopping_patience = stopping_patience - self.total_epochs = total_epochs - - self.stop_epochs = [None] * self.n_replicas - self.best_epochs = [None] * self.n_replicas - self.positivity_statusses = [POS_BAD] * self.n_replicas - self._best_weights = [None] * self.n_replicas - self._best_val_chi2s = [INITIAL_CHI2] * self.n_replicas + self._n_replicas = len(pdf_models) + self._threshold_chi2 = threshold_chi2 + self._stopping_degrees = np.zeros(self._n_replicas, dtype=int) + self._counts = np.zeros(self._n_replicas, dtype=int) + + self._dont_stop = dont_stop + self._stop_now = False + self._stopping_patience = stopping_patience + self._total_epochs = total_epochs + + self._stop_epochs = [None] * self._n_replicas + self.best_epochs = [None] * self._n_replicas + self.positivity_statusses = [POS_BAD] * self._n_replicas + self._best_weights = [None] * self._n_replicas + self._best_val_chi2s = [INITIAL_CHI2] * self._n_replicas @property def vl_chi2(self): @@ -377,7 +377,7 @@ def e_best_chi2(self): """Epoch of the best chi2, if there is no best epoch, return last""" best_or_last_epochs = [ best if best is not None else last - for best, last in zip(self.best_epochs, self.stop_epochs) + for best, last in zip(self.best_epochs, self._stop_epochs) ] return best_or_last_epochs @@ -453,12 +453,12 @@ def monitor_chi2(self, training_info, epoch, print_stats=False): # this means improving vl_chi2 and passing positivity # Don't start counting until the chi2 of the validation goes below a certain threshold # once we start counting, don't bother anymore - passes = self.counts | (fitstate.vl_chi2 < self.threshold_chi2) + passes = self._counts | (fitstate.vl_chi2 < self._threshold_chi2) passes &= fitstate.vl_loss < self._best_val_chi2s # And the ones that pass positivity passes &= self._positivity(fitstate) - self.stopping_degrees += self.counts + self._stopping_degrees += self._counts # Step 5. loop over the valid indices to check whether the vl improved for i_replica in np.where(passes)[0]: @@ -467,18 +467,18 @@ def monitor_chi2(self, training_info, epoch, print_stats=False): self.positivity_statusses[i_replica] = POS_OK self._best_val_chi2s[i_replica] = self._history.get_state(epoch).vl_loss[i_replica] - self._best_weights[i_replica] = self.pdf_models[i_replica].get_weights() + self._best_weights[i_replica] = self._pdf_models[i_replica].get_weights() - self.stopping_degrees[i_replica] = 0 - self.counts[i_replica] = 1 + self._stopping_degrees[i_replica] = 0 + self._counts[i_replica] = 1 - stop_replicas = self.counts & (self.stopping_degrees > self.stopping_patience) + stop_replicas = self._counts & (self._stopping_degrees > self._stopping_patience) for i_replica in np.where(stop_replicas)[0]: - self.stop_epochs[i_replica] = epoch - self.counts[i_replica] = 0 + self._stop_epochs[i_replica] = epoch + self._counts[i_replica] = 0 # By using the stopping degree we only stop when none of the replicas are improving anymore - if min(self.stopping_degrees) > self.stopping_patience: + if min(self._stopping_degrees) > self._stopping_patience: self.make_stop() return True @@ -486,11 +486,11 @@ def make_stop(self): """Convenience method to set the stop_now flag and reload the history to the point of the best model if any """ - self.stop_now = True + self._stop_now = True self._restore_best_weights() def _restore_best_weights(self): - for replica, weights in zip(self.pdf_models, self._best_weights): + for replica, weights in zip(self._pdf_models, self._best_weights): replica.set_weights(weights) def print_current_stats(self, epoch, fitstate): @@ -500,10 +500,10 @@ def print_current_stats(self, epoch, fitstate): epoch_index = epoch + 1 tr_chi2 = fitstate.total_tr_chi2() vl_chi2 = fitstate.total_vl_chi2() - total_str = f"At epoch {epoch_index}/{self.total_epochs}, total chi2: {tr_chi2}\n" + total_str = f"At epoch {epoch_index}/{self._total_epochs}, total chi2: {tr_chi2}\n" # The partial chi2 makes no sense for more than one replica at once: - if self.n_replicas == 1: + if self._n_replicas == 1: partial_tr_chi2 = fitstate.total_partial_tr_chi2() partials = [] for experiment, chi2 in partial_tr_chi2.items(): @@ -516,10 +516,10 @@ def stop_here(self): """Returns the stopping status If `dont_stop` is set returns always False (i.e., never stop) """ - if self.dont_stop: + if self._dont_stop: return False else: - return self.stop_now + return self._stop_now def chi2exps_json(self, i_replica=0, log_each=100): """ From f541e1b4094b4f1d8cf1fc3f2af09c300b38a2c7 Mon Sep 17 00:00:00 2001 From: Aron Date: Wed, 16 Aug 2023 15:20:14 +0200 Subject: [PATCH 16/29] Precompute preprocessing, arclengths, integrability_numbers --- n3fit/src/n3fit/io/writer.py | 41 +++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/n3fit/src/n3fit/io/writer.py b/n3fit/src/n3fit/io/writer.py index 64b3b3424a..efa550e6d5 100644 --- a/n3fit/src/n3fit/io/writer.py +++ b/n3fit/src/n3fit/io/writer.py @@ -264,6 +264,17 @@ def write_data(self, save_path, fitname, weights_name): """ os.makedirs(save_path, exist_ok=True) + self.preprocessing = [ + pdf_object.get_preprocessing_factors() for pdf_object in self.pdf_objects + ] + self.arc_lengths = [ + vpinterface.compute_arclength(pdf_object).tolist() for pdf_object in self.pdf_objects + ] + self.integrability_numbers = [ + vpinterface.integrability_numbers(pdf_object).tolist() + for pdf_object in self.pdf_objects + ] + for i in range(len(self.replica_numbers)): replica_path = f"{save_path}/replica_{self.replica_numbers[i]}" os.makedirs(replica_path, exist_ok=True) @@ -284,7 +295,9 @@ def _write_metadata_json(self, i, out_path): json_dict = jsonfit( best_epoch=self.stopping_object.best_epochs[i], positivity_status=self.stopping_object.positivity_statusses[i], - pdf_object=self.pdf_objects[i], + preprocessing=self.preprocessing[i], + arc_lengths=self.arc_lengths[i], + integrability_numbers=self.integrability_numbers[i], tr_chi2=self.tr_chi2[i], vl_chi2=self.vl_chi2[i], true_chi2=self.true_chi2[i], @@ -330,7 +343,16 @@ def default(self, o): def jsonfit( - best_epoch, positivity_status, pdf_object, tr_chi2, vl_chi2, true_chi2, stop_epoch, timing + best_epoch, + positivity_status, + preprocessing, + arc_lengths, + integrability_numbers, + tr_chi2, + vl_chi2, + true_chi2, + stop_epoch, + timing, ): """Generates a dictionary containing all relevant metadata for the fit @@ -340,9 +362,12 @@ def jsonfit( epoch at which the best fit was found positivity_status: str string describing the positivity status of the fit - pdf_object: n3fit.vpinterface.N3PDF - N3PDF object constructed from the pdf_model - that receives as input a point in x and returns an array of 14 flavours + preprocessing: dict + dictionary of the preprocessing factors + arc_lengths: list + list of the arc lengths of the different PDFs + integrability_numbers: list + list of the integrability numbers of the different PDFs tr_chi2: float chi2 for the training vl_chi2: float @@ -356,7 +381,7 @@ def jsonfit( """ all_info = {} # Generate preprocessing information - all_info["preprocessing"] = pdf_object.get_preprocessing_factors() + all_info["preprocessing"] = preprocessing # .fitinfo-like info all_info["stop_epoch"] = stop_epoch all_info["best_epoch"] = best_epoch @@ -364,8 +389,8 @@ def jsonfit( all_info["erf_vl"] = vl_chi2 all_info["chi2"] = true_chi2 all_info["pos_state"] = positivity_status - all_info["arc_lengths"] = vpinterface.compute_arclength(pdf_object).tolist() - all_info["integrability"] = vpinterface.integrability_numbers(pdf_object).tolist() + all_info["arc_lengths"] = arc_lengths + all_info["integrability"] = integrability_numbers all_info["timing"] = timing # Versioning info all_info["version"] = version() From 9f07416260ffe37084e85fb8b488bb7b6665bcb4 Mon Sep 17 00:00:00 2001 From: Aron Date: Mon, 4 Sep 2023 11:22:19 +0200 Subject: [PATCH 17/29] Add if statement before restoring best weights --- n3fit/src/n3fit/stopping.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py index 848f57f2f8..955eebb841 100644 --- a/n3fit/src/n3fit/stopping.py +++ b/n3fit/src/n3fit/stopping.py @@ -491,7 +491,8 @@ def make_stop(self): def _restore_best_weights(self): for replica, weights in zip(self._pdf_models, self._best_weights): - replica.set_weights(weights) + if weights: + replica.set_weights(weights) def print_current_stats(self, epoch, fitstate): """ From 78a6d3007f6e73e6fd4fc36910e991bccc85fbf8 Mon Sep 17 00:00:00 2001 From: Aron Date: Mon, 4 Sep 2023 14:02:07 +0200 Subject: [PATCH 18/29] Add default best_epoch to last_epoch --- n3fit/src/n3fit/stopping.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py index 955eebb841..5200d541bb 100644 --- a/n3fit/src/n3fit/stopping.py +++ b/n3fit/src/n3fit/stopping.py @@ -359,8 +359,8 @@ def __init__( self._stopping_patience = stopping_patience self._total_epochs = total_epochs - self._stop_epochs = [None] * self._n_replicas - self.best_epochs = [None] * self._n_replicas + self._stop_epochs = [total_epochs - 1] * self._n_replicas + self._best_epochs = [None] * self._n_replicas self.positivity_statusses = [POS_BAD] * self._n_replicas self._best_weights = [None] * self._n_replicas self._best_val_chi2s = [INITIAL_CHI2] * self._n_replicas @@ -377,7 +377,7 @@ def e_best_chi2(self): """Epoch of the best chi2, if there is no best epoch, return last""" best_or_last_epochs = [ best if best is not None else last - for best, last in zip(self.best_epochs, self._stop_epochs) + for best, last in zip(self._best_epochs, self._stop_epochs) ] return best_or_last_epochs @@ -386,6 +386,14 @@ def stop_epoch(self): """Epoch in which the fit is stopped""" return self._history.final_epoch + 1 + @property + def best_epochs(self): + """Epochs in which the best chi2 was found, or the last one if no best was found""" + best_or_last_epochs = self._best_epochs + for i_replica in np.where([be == None for be in self._best_epochs])[0]: + best_or_last_epochs[i_replica] = self._stop_epochs[i_replica] + return best_or_last_epochs + @property def positivity_status(self): """Returns POS_PASS if positivity passes or veto if it doesn't @@ -462,7 +470,7 @@ def monitor_chi2(self, training_info, epoch, print_stats=False): # Step 5. loop over the valid indices to check whether the vl improved for i_replica in np.where(passes)[0]: - self.best_epochs[i_replica] = epoch + self._best_epochs[i_replica] = epoch # By definition, if we have a ``best_epoch`` then positivity passed self.positivity_statusses[i_replica] = POS_OK From f8b985ff9bdac59f692861fb7632c35aa71f9264 Mon Sep 17 00:00:00 2001 From: Aron Date: Wed, 4 Oct 2023 13:00:21 +0200 Subject: [PATCH 19/29] statusses -> statuses --- n3fit/src/n3fit/io/writer.py | 2 +- n3fit/src/n3fit/stopping.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/n3fit/src/n3fit/io/writer.py b/n3fit/src/n3fit/io/writer.py index efa550e6d5..0b1b27e49f 100644 --- a/n3fit/src/n3fit/io/writer.py +++ b/n3fit/src/n3fit/io/writer.py @@ -294,7 +294,7 @@ def _write_chi2s(self, out_path): def _write_metadata_json(self, i, out_path): json_dict = jsonfit( best_epoch=self.stopping_object.best_epochs[i], - positivity_status=self.stopping_object.positivity_statusses[i], + positivity_status=self.stopping_object.positivity_statuses[i], preprocessing=self.preprocessing[i], arc_lengths=self.arc_lengths[i], integrability_numbers=self.integrability_numbers[i], diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py index 5200d541bb..5b6ddd18ac 100644 --- a/n3fit/src/n3fit/stopping.py +++ b/n3fit/src/n3fit/stopping.py @@ -361,7 +361,7 @@ def __init__( self._stop_epochs = [total_epochs - 1] * self._n_replicas self._best_epochs = [None] * self._n_replicas - self.positivity_statusses = [POS_BAD] * self._n_replicas + self.positivity_statuses = [POS_BAD] * self._n_replicas self._best_weights = [None] * self._n_replicas self._best_val_chi2s = [INITIAL_CHI2] * self._n_replicas @@ -398,7 +398,7 @@ def best_epochs(self): def positivity_status(self): """Returns POS_PASS if positivity passes or veto if it doesn't for each replica""" - return self.positivity_statusses + return self.positivity_statuses def evaluate_training(self, training_model): """Given the training model, evaluates the @@ -472,7 +472,7 @@ def monitor_chi2(self, training_info, epoch, print_stats=False): for i_replica in np.where(passes)[0]: self._best_epochs[i_replica] = epoch # By definition, if we have a ``best_epoch`` then positivity passed - self.positivity_statusses[i_replica] = POS_OK + self.positivity_statuses[i_replica] = POS_OK self._best_val_chi2s[i_replica] = self._history.get_state(epoch).vl_loss[i_replica] self._best_weights[i_replica] = self._pdf_models[i_replica].get_weights() From e2f65eca95f253bd9c323af5478776e4434d11db Mon Sep 17 00:00:00 2001 From: Aron Date: Wed, 4 Oct 2023 13:05:06 +0200 Subject: [PATCH 20/29] combine 3 list comprehensions into one loop --- n3fit/src/n3fit/io/writer.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/n3fit/src/n3fit/io/writer.py b/n3fit/src/n3fit/io/writer.py index 0b1b27e49f..4c1b9ec610 100644 --- a/n3fit/src/n3fit/io/writer.py +++ b/n3fit/src/n3fit/io/writer.py @@ -264,16 +264,15 @@ def write_data(self, save_path, fitname, weights_name): """ os.makedirs(save_path, exist_ok=True) - self.preprocessing = [ - pdf_object.get_preprocessing_factors() for pdf_object in self.pdf_objects - ] - self.arc_lengths = [ - vpinterface.compute_arclength(pdf_object).tolist() for pdf_object in self.pdf_objects - ] - self.integrability_numbers = [ - vpinterface.integrability_numbers(pdf_object).tolist() - for pdf_object in self.pdf_objects - ] + self.preprocessing = [] + self.arc_lengths = [] + self.integrability_numbers = [] + for pdf_object in self.pdf_objects: + self.preprocessing.append(pdf_object.get_preprocessing_factors()) + self.arc_lengths.append(vpinterface.compute_arclength(pdf_object).tolist()) + self.integrability_numbers.append( + vpinterface.integrability_numbers(pdf_object).tolist() + ) for i in range(len(self.replica_numbers)): replica_path = f"{save_path}/replica_{self.replica_numbers[i]}" From 6b364cfaa04764249d3105fdf55266e3d270abd3 Mon Sep 17 00:00:00 2001 From: Aron Date: Wed, 4 Oct 2023 13:07:28 +0200 Subject: [PATCH 21/29] Clarify comment --- n3fit/src/n3fit/io/writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/n3fit/src/n3fit/io/writer.py b/n3fit/src/n3fit/io/writer.py index 4c1b9ec610..8dc33e3bc0 100644 --- a/n3fit/src/n3fit/io/writer.py +++ b/n3fit/src/n3fit/io/writer.py @@ -300,7 +300,7 @@ def _write_metadata_json(self, i, out_path): tr_chi2=self.tr_chi2[i], vl_chi2=self.vl_chi2[i], true_chi2=self.true_chi2[i], - # Note: last 2 same for all replicas, unless run separately + # Note: last 2 arguments below are the same for all replicas, unless run separately timing=self.timings, stop_epoch=self.stopping_object.stop_epoch, ) From 5c3c2e87153edb81bb39085a79d408fe095a9336 Mon Sep 17 00:00:00 2001 From: Aron Date: Wed, 4 Oct 2023 13:20:29 +0200 Subject: [PATCH 22/29] Using PurePath instead of str --- n3fit/src/n3fit/io/writer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/n3fit/src/n3fit/io/writer.py b/n3fit/src/n3fit/io/writer.py index 8dc33e3bc0..cbccd7beec 100644 --- a/n3fit/src/n3fit/io/writer.py +++ b/n3fit/src/n3fit/io/writer.py @@ -7,6 +7,7 @@ import json import logging import os +from pathlib import PurePath import numpy as np @@ -328,8 +329,8 @@ def _write_weights(self, i, out_path): log.info(" > Saving the weights for future in %s", out_path) # Extract model out of N3PDF model = self.pdf_objects[i]._models[0] - # Need to use "str" here because TF 2.2 has a bug for paths objects (fixed in 2.3) - model.save_weights(str(out_path), save_format="h5") + # PurePath to avoid tensorflow 2.2 bug with paths + model.save_weights(PurePath(out_path), save_format="h5") class SuperEncoder(json.JSONEncoder): From 905218c283977003a17b5e4a03272776eeb198f5 Mon Sep 17 00:00:00 2001 From: Aron Jansen Date: Tue, 10 Oct 2023 13:34:16 +0200 Subject: [PATCH 23/29] improve documentation Co-authored-by: Juan M. Cruz-Martinez --- n3fit/src/n3fit/io/writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/n3fit/src/n3fit/io/writer.py b/n3fit/src/n3fit/io/writer.py index cbccd7beec..b5e4117abb 100644 --- a/n3fit/src/n3fit/io/writer.py +++ b/n3fit/src/n3fit/io/writer.py @@ -257,7 +257,7 @@ def write_data(self, save_path, fitname, weights_name): Parameters ---------- `save_path` - path for the replica, ex: `${PWD}/runcard_name/nnfit` + path for the fit results, ex: `${PWD}/runcard_name/nnfit` `fitname` name of the fit, ex: `Basic_runcard` `weights_name` From 17ff6020642d96f337195b812053493922174b29 Mon Sep 17 00:00:00 2001 From: Aron Date: Tue, 10 Oct 2023 13:42:45 +0200 Subject: [PATCH 24/29] Use path object syntax --- n3fit/src/n3fit/io/writer.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/n3fit/src/n3fit/io/writer.py b/n3fit/src/n3fit/io/writer.py index b5e4117abb..98e2674e8f 100644 --- a/n3fit/src/n3fit/io/writer.py +++ b/n3fit/src/n3fit/io/writer.py @@ -6,7 +6,6 @@ """ import json import logging -import os from pathlib import PurePath import numpy as np @@ -263,7 +262,7 @@ def write_data(self, save_path, fitname, weights_name): `weights_name` name of the file to save weights to, if not empty """ - os.makedirs(save_path, exist_ok=True) + save_path.mkdir(exist_ok=True, parents=True) self.preprocessing = [] self.arc_lengths = [] @@ -275,15 +274,15 @@ def write_data(self, save_path, fitname, weights_name): vpinterface.integrability_numbers(pdf_object).tolist() ) - for i in range(len(self.replica_numbers)): - replica_path = f"{save_path}/replica_{self.replica_numbers[i]}" - os.makedirs(replica_path, exist_ok=True) + for i, rn in enumerate(self.replica_numbers): + replica_path = save_path / f"replica_{rn}" + replica_path.mkdir(exist_ok=True, parents=True) - self._write_chi2s(f"{replica_path}/chi2exps.log") - self._write_metadata_json(i, f"{replica_path}/{fitname}.json") - self._export_pdf_grid(i, f"{replica_path}/{fitname}.exportgrid") + self._write_chi2s(replica_path / "chi2exps.log") + self._write_metadata_json(i, replica_path / f"{fitname}.json") + self._export_pdf_grid(i, replica_path / f"{fitname}.exportgrid") if weights_name: - self._write_weights(i, f"{replica_path}/{weights_name}") + self._write_weights(i, replica_path / f"{weights_name}") def _write_chi2s(self, out_path): # Note: same for all replicas, unless run separately @@ -293,7 +292,7 @@ def _write_chi2s(self, out_path): def _write_metadata_json(self, i, out_path): json_dict = jsonfit( - best_epoch=self.stopping_object.best_epochs[i], + best_epoch=self.stopping_object.best_epochs, positivity_status=self.stopping_object.positivity_statuses[i], preprocessing=self.preprocessing[i], arc_lengths=self.arc_lengths[i], From d82c826d281ccdacf0b18b5c4068fda2d802380a Mon Sep 17 00:00:00 2001 From: Aron Jansen Date: Tue, 10 Oct 2023 13:43:53 +0200 Subject: [PATCH 25/29] Update n3fit/src/n3fit/stopping.py Co-authored-by: Juan M. Cruz-Martinez --- n3fit/src/n3fit/stopping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py index 5b6ddd18ac..657322f888 100644 --- a/n3fit/src/n3fit/stopping.py +++ b/n3fit/src/n3fit/stopping.py @@ -499,7 +499,7 @@ def make_stop(self): def _restore_best_weights(self): for replica, weights in zip(self._pdf_models, self._best_weights): - if weights: + if weights is not None: replica.set_weights(weights) def print_current_stats(self, epoch, fitstate): From 93afd75f1c2f773f3c23cf38c9a84a9d8e863859 Mon Sep 17 00:00:00 2001 From: Aron Jansen Date: Tue, 10 Oct 2023 13:46:42 +0200 Subject: [PATCH 26/29] Update n3fit/src/n3fit/io/writer.py Co-authored-by: Juan M. Cruz-Martinez --- n3fit/src/n3fit/io/writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/n3fit/src/n3fit/io/writer.py b/n3fit/src/n3fit/io/writer.py index 98e2674e8f..79afc60135 100644 --- a/n3fit/src/n3fit/io/writer.py +++ b/n3fit/src/n3fit/io/writer.py @@ -300,7 +300,7 @@ def _write_metadata_json(self, i, out_path): tr_chi2=self.tr_chi2[i], vl_chi2=self.vl_chi2[i], true_chi2=self.true_chi2[i], - # Note: last 2 arguments below are the same for all replicas, unless run separately + # Note: the 2 arguments below are the same for all replicas, unless run separately timing=self.timings, stop_epoch=self.stopping_object.stop_epoch, ) From 5a6f743938d7b84d8f7397cc9a617ee5aec7c5ad Mon Sep 17 00:00:00 2001 From: Aron Date: Tue, 10 Oct 2023 13:57:22 +0200 Subject: [PATCH 27/29] Remove fix for tensorflow 2.2 path bug --- n3fit/src/n3fit/io/writer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/n3fit/src/n3fit/io/writer.py b/n3fit/src/n3fit/io/writer.py index 79afc60135..679e45aba1 100644 --- a/n3fit/src/n3fit/io/writer.py +++ b/n3fit/src/n3fit/io/writer.py @@ -328,8 +328,7 @@ def _write_weights(self, i, out_path): log.info(" > Saving the weights for future in %s", out_path) # Extract model out of N3PDF model = self.pdf_objects[i]._models[0] - # PurePath to avoid tensorflow 2.2 bug with paths - model.save_weights(PurePath(out_path), save_format="h5") + model.save_weights(out_path, save_format="h5") class SuperEncoder(json.JSONEncoder): From e0b9cc885ae2a2bd1ece8208f524b4aac732d630 Mon Sep 17 00:00:00 2001 From: Aron Date: Tue, 10 Oct 2023 14:34:35 +0200 Subject: [PATCH 28/29] Remove best_epoch method --- n3fit/src/n3fit/io/writer.py | 2 +- n3fit/src/n3fit/stopping.py | 8 -------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/n3fit/src/n3fit/io/writer.py b/n3fit/src/n3fit/io/writer.py index 679e45aba1..1846cb7226 100644 --- a/n3fit/src/n3fit/io/writer.py +++ b/n3fit/src/n3fit/io/writer.py @@ -292,7 +292,7 @@ def _write_chi2s(self, out_path): def _write_metadata_json(self, i, out_path): json_dict = jsonfit( - best_epoch=self.stopping_object.best_epochs, + best_epoch=self.stopping_object.e_best_chi2[i], positivity_status=self.stopping_object.positivity_statuses[i], preprocessing=self.preprocessing[i], arc_lengths=self.arc_lengths[i], diff --git a/n3fit/src/n3fit/stopping.py b/n3fit/src/n3fit/stopping.py index 657322f888..0c9bc99e9a 100644 --- a/n3fit/src/n3fit/stopping.py +++ b/n3fit/src/n3fit/stopping.py @@ -386,14 +386,6 @@ def stop_epoch(self): """Epoch in which the fit is stopped""" return self._history.final_epoch + 1 - @property - def best_epochs(self): - """Epochs in which the best chi2 was found, or the last one if no best was found""" - best_or_last_epochs = self._best_epochs - for i_replica in np.where([be == None for be in self._best_epochs])[0]: - best_or_last_epochs[i_replica] = self._stop_epochs[i_replica] - return best_or_last_epochs - @property def positivity_status(self): """Returns POS_PASS if positivity passes or veto if it doesn't From 2396d44b6745fccbd887206b846ba07fa1dc9338 Mon Sep 17 00:00:00 2001 From: "Juan M. Cruz-Martinez" Date: Wed, 11 Oct 2023 14:18:40 +0200 Subject: [PATCH 29/29] remove PurePath --- n3fit/src/n3fit/io/writer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/n3fit/src/n3fit/io/writer.py b/n3fit/src/n3fit/io/writer.py index 1846cb7226..4fa71eef58 100644 --- a/n3fit/src/n3fit/io/writer.py +++ b/n3fit/src/n3fit/io/writer.py @@ -6,7 +6,6 @@ """ import json import logging -from pathlib import PurePath import numpy as np