From be4f48ac6d0a0c28e3ead8a980163c895d52a373 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Thu, 28 May 2020 23:06:31 -0700 Subject: [PATCH 001/149] save outputs --- smdebug/core/tfevent/event_file_reader.py | 1 + smdebug/tensorflow/base_hook.py | 2 +- smdebug/tensorflow/collection.py | 2 +- smdebug/tensorflow/keras.py | 19 ++++++++++++++++++- tests/tensorflow2/test_keras.py | 13 +++++++++++-- 5 files changed, 32 insertions(+), 5 deletions(-) diff --git a/smdebug/core/tfevent/event_file_reader.py b/smdebug/core/tfevent/event_file_reader.py index c5f4b5fd9..176fb9e38 100644 --- a/smdebug/core/tfevent/event_file_reader.py +++ b/smdebug/core/tfevent/event_file_reader.py @@ -38,6 +38,7 @@ def as_dtype(t): types_pb2.DT_INT64: np.int64, types_pb2.DT_STRING: np.str, types_pb2.DT_BOOL: np.bool, + types_pb2.DT_UINT8: np.uint8, } return _INTERN_TABLE[t] diff --git a/smdebug/tensorflow/base_hook.py b/smdebug/tensorflow/base_hook.py index 188f6c71e..03de7358e 100644 --- a/smdebug/tensorflow/base_hook.py +++ b/smdebug/tensorflow/base_hook.py @@ -437,7 +437,7 @@ def _write_for_tensor(self, tensor_name, tensor_value, save_collections, tensor_ # this tensor_name is tf tensor name, need to convert to export_name tensor_ref = self._get_tensor_ref(tensor_name, save_collections=save_collections) - if tensor_ref: + if tensor_ref is not None: name = tensor_ref.export_name super()._write_for_tensor( name, tensor_value, save_collections=save_collections, tensor_ref=tensor_ref diff --git a/smdebug/tensorflow/collection.py b/smdebug/tensorflow/collection.py index 89df3916a..900716479 100644 --- a/smdebug/tensorflow/collection.py +++ b/smdebug/tensorflow/collection.py @@ -58,7 +58,7 @@ def add_distributed_variable(self, arg, export_name=None, mode=None): def add_aggregating_variable(self, arg, name=None, mode=None): return self.add_variable(arg.get(), name, mode=mode) - def add_tensor(self, arg, name=None, mode=None): + def add_tensor(self, arg, name=None, mode=None, type=None): # in keras we need to store the mode and only get tensors by mode return self._store_tensor_ref(TensorRef.from_tensor(arg, name, mode=mode)) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index fea005a31..a0629cf5a 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -366,6 +366,21 @@ def _add_metric(self, metric_name, metric_value: tf.Tensor = None): coll.set_tensor_ref(TensorRef.from_non_graph_var(metric_name)) self.tensor_to_collections[metric_name] = {coll} + def _save_model_outputs(self, logs): + if logs is None: + return + + if self._is_collection_being_saved_for_step(CollectionKeys.OUTPUTS): + export_names = {"y_pred": "train_output/y_pred", "y": "train_output/y"} + self._initialize_writers(only_initialize_if_missing=True) + output_collection = self.collection_manager.get(CollectionKeys.OUTPUTS) + for key in logs: + if key in ["y", "y_pred"]: + tensor_ref = TensorRef.from_non_graph_var(export_names[key]) + output_collection.set_tensor_ref(tensor_ref) + self.tensor_to_collections[export_names[key]] = {output_collection} + self._save_for_tensor(export_names[key], logs[key], check_before_write=False) + def _save_metrics(self, batch, logs, force_save=False): # if force_save is True, doesn't check whether collection needs to be saved for steps if logs is None: @@ -375,7 +390,7 @@ def _save_metrics(self, batch, logs, force_save=False): self._initialize_writers(only_initialize_if_missing=True) logs["batch"] = batch for key in logs: - if key in ["loss", "val_loss", "outputs"]: + if key in ["loss", "val_loss", "outputs", "y", "y_pred"]: # outputs is saved differently through outputs collection continue self._add_metric(metric_name=key) @@ -392,6 +407,7 @@ def _save_tensors_post_step(self, batch, logs): # some tensors available as value from within hook are saved here # weights, metrics self._save_metrics(batch, logs) + self._save_model_outputs(logs) if is_tf_version_2x() and tf.executing_eagerly(): for tensor_ref in self.tensor_refs_to_save_this_step: @@ -479,6 +495,7 @@ def on_epoch_end(self, batch, logs=None): if self._is_not_supported(): return self._save_metrics(batch=batch, logs=logs, force_save=True) + self._save_model_outputs(logs=logs) self._close_writers() def _on_any_mode_begin(self, mode): diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index ae972126a..16e6e3c47 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -412,9 +412,14 @@ def test_keras_fit(out_dir, tf_eager_mode, saveall): # can't save gradients in TF 2.x eager mode if saveall: # save losses, metrics, weights, biases, scalar if tf_eager_mode: - assert len(trial.tensor_names()) == (13 if is_tf_2_2() else 14) + assert len(trial.tensor_names()) == (16 if is_tf_2_2() else 17) else: - assert len(trial.tensor_names()) == 21 + assert len(trial.tensor_names()) == 24 + assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 2 + assert ( + len(trial.tensor_names(collection=CollectionKeys.OUTPUTS, mode=ModeKeys.PREDICT)) == 0 + ) # bug: + assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS, mode=ModeKeys.TRAIN)) == 2 assert len(trial.tensor_names(collection=CollectionKeys.BIASES)) == 2 assert len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) == 2 assert len(trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES)) == 5 @@ -427,6 +432,10 @@ def test_keras_fit(out_dir, tf_eager_mode, saveall): == 0, "No Optimizer Variables Should be Saved in EVAL Mode", ) + for tname in trial.tensor_names(collection=CollectionKeys.OUTPUTS): + output = trial.tensor(tname) + assert tname in ["train_output/y_pred", "train_output/y", "predict_output"] + assert output.value(0) is not None else: # save the default losses and metrics assert len(trial.tensor_names()) == (4 if is_tf_2_2() and tf_eager_mode else 5) assert len(trial.tensor_names(collection=CollectionKeys.LOSSES)) == 1 From d32d01796969e225ebbe47f77fa0ce04fe6cb416 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Tue, 2 Jun 2020 23:26:14 -0700 Subject: [PATCH 002/149] assert updates --- tests/tensorflow2/test_keras.py | 6 +++--- tests/tensorflow2/test_keras_mirrored.py | 6 ++++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 16e6e3c47..4339dd39f 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -412,7 +412,7 @@ def test_keras_fit(out_dir, tf_eager_mode, saveall): # can't save gradients in TF 2.x eager mode if saveall: # save losses, metrics, weights, biases, scalar if tf_eager_mode: - assert len(trial.tensor_names()) == (16 if is_tf_2_2() else 17) + assert len(trial.tensor_names()) == (15 if is_tf_2_2() else 16) else: assert len(trial.tensor_names()) == 24 assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 2 @@ -593,7 +593,7 @@ def test_include_collections(out_dir, tf_eager_mode): trial = smd.create_trial(path=out_dir) # can't save gradients in TF 2.x if tf_eager_mode: - assert len(trial.tensor_names()) == (12 if is_tf_2_2() else 13) + assert len(trial.tensor_names()) == (14 if is_tf_2_2() else 15) else: assert len(trial.tensor_names()) == 18 assert len(trial.tensor_names(collection=CollectionKeys.GRADIENTS)) == 4 @@ -654,7 +654,7 @@ def test_keras_fit_pure_eager(out_dir, tf_eager_mode): helper_keras_fit(trial_dir=out_dir, hook=hook, eager=tf_eager_mode, run_eagerly=True) trial = smd.create_trial(path=out_dir) - assert len(trial.tensor_names()) == (12 if is_tf_2_2() else 13) + assert len(trial.tensor_names()) == (14 if is_tf_2_2() else 15) assert len(trial.tensor_names(collection=CollectionKeys.BIASES)) == 2 assert len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) == 2 assert len(trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES)) == 5 diff --git a/tests/tensorflow2/test_keras_mirrored.py b/tests/tensorflow2/test_keras_mirrored.py index 3ff6f307a..d7630fb9f 100644 --- a/tests/tensorflow2/test_keras_mirrored.py +++ b/tests/tensorflow2/test_keras_mirrored.py @@ -256,8 +256,10 @@ def test_save_all(out_dir, tf_eager_mode, workers): tr = create_trial_fast_refresh(out_dir) print(tr.tensor_names()) if tf_eager_mode: - assert len(tr.tensor_names()) == (6 + 2 + 1 + 5 + 1 if is_tf_2_2() else 6 + 3 + 1 + 5 + 1) - # weights, metrics, losses, optimizer variables, scalar + assert len(tr.tensor_names()) == ( + 6 + 2 + 1 + 5 + 1 + 2 if is_tf_2_2() else 6 + 3 + 1 + 5 + 1 + 2 + ) + # weights, metrics, losses, optimizer variables, scalar, model outputs else: assert ( len(tr.tensor_names()) From 8e95f127434a2ac6ae946b671f74ae9f3d0670c5 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Tue, 2 Jun 2020 23:43:21 -0700 Subject: [PATCH 003/149] update assert --- tests/tensorflow2/test_keras_mirrored.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/tensorflow2/test_keras_mirrored.py b/tests/tensorflow2/test_keras_mirrored.py index d7630fb9f..f28deaadd 100644 --- a/tests/tensorflow2/test_keras_mirrored.py +++ b/tests/tensorflow2/test_keras_mirrored.py @@ -165,10 +165,10 @@ def exhaustive_check(trial_dir, include_workers="one", eager=True): assert len(tr.workers()) == strategy.num_replicas_in_sync if eager: assert len(tr.tensor_names()) == ( - 6 + 1 + 2 + 5 + 1 if is_tf_2_2() else 6 + 1 + 3 + 5 + 1 + 6 + 1 + 2 + 5 + 1 + 2 if is_tf_2_2() else 6 + 1 + 3 + 5 + 1 + 2 ) - # 6 weights, 1 loss, 3 metrics, 5 optimizer variables for Tf 2.1, 1 scalar - # 6 weights, 1 loss, 2 metrics, 5 optimizer variables for Tf 2.2, 1 scalar + # 6 weights, 1 loss, 3 metrics, 5 optimizer variables for Tf 2.1, 1 scalar, 2 model outputs + # 6 weights, 1 loss, 2 metrics, 5 optimizer variables for Tf 2.2, 1 scalar, 2 model outputs else: assert len(tr.tensor_names()) == (6 + 6 + 1 + 3 + strategy.num_replicas_in_sync * 3 + 5) else: From 48f45d6db12382639453b5b952611dcdf785f42c Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Wed, 3 Jun 2020 00:20:05 -0700 Subject: [PATCH 004/149] cleanup --- smdebug/core/tfevent/event_file_reader.py | 1 - smdebug/tensorflow/collection.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/smdebug/core/tfevent/event_file_reader.py b/smdebug/core/tfevent/event_file_reader.py index 176fb9e38..c5f4b5fd9 100644 --- a/smdebug/core/tfevent/event_file_reader.py +++ b/smdebug/core/tfevent/event_file_reader.py @@ -38,7 +38,6 @@ def as_dtype(t): types_pb2.DT_INT64: np.int64, types_pb2.DT_STRING: np.str, types_pb2.DT_BOOL: np.bool, - types_pb2.DT_UINT8: np.uint8, } return _INTERN_TABLE[t] diff --git a/smdebug/tensorflow/collection.py b/smdebug/tensorflow/collection.py index 900716479..89df3916a 100644 --- a/smdebug/tensorflow/collection.py +++ b/smdebug/tensorflow/collection.py @@ -58,7 +58,7 @@ def add_distributed_variable(self, arg, export_name=None, mode=None): def add_aggregating_variable(self, arg, name=None, mode=None): return self.add_variable(arg.get(), name, mode=mode) - def add_tensor(self, arg, name=None, mode=None, type=None): + def add_tensor(self, arg, name=None, mode=None): # in keras we need to store the mode and only get tensors by mode return self._store_tensor_ref(TensorRef.from_tensor(arg, name, mode=mode)) From 55f10d4d2bb6b34e4e056d00fcecff999e55c60a Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Wed, 3 Jun 2020 02:47:30 -0700 Subject: [PATCH 005/149] as_dtype: --- smdebug/core/tfevent/event_file_reader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/smdebug/core/tfevent/event_file_reader.py b/smdebug/core/tfevent/event_file_reader.py index c5f4b5fd9..176fb9e38 100644 --- a/smdebug/core/tfevent/event_file_reader.py +++ b/smdebug/core/tfevent/event_file_reader.py @@ -38,6 +38,7 @@ def as_dtype(t): types_pb2.DT_INT64: np.int64, types_pb2.DT_STRING: np.str, types_pb2.DT_BOOL: np.bool, + types_pb2.DT_UINT8: np.uint8, } return _INTERN_TABLE[t] From ec8202153448c19b5119c662a2bd94ed21f5241d Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Thu, 4 Jun 2020 01:13:32 -0700 Subject: [PATCH 006/149] model outputs are now constants --- smdebug/tensorflow/keras.py | 11 +++++++---- smdebug/tensorflow/utils.py | 5 +++++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index a0629cf5a..84fa8e613 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -15,6 +15,7 @@ from .collection import CollectionKeys from .tensor_ref import TensorRef, get_tf_names from .utils import ( + ModelOutput, TFDistributionStrategy, get_export_name_for_keras, get_keras_layer_inputs, @@ -371,11 +372,14 @@ def _save_model_outputs(self, logs): return if self._is_collection_being_saved_for_step(CollectionKeys.OUTPUTS): - export_names = {"y_pred": "train_output/y_pred", "y": "train_output/y"} + export_names = { + ModelOutput.Y_PRED: "train_output/y_pred", + ModelOutput.Y: "train_output/y", + } self._initialize_writers(only_initialize_if_missing=True) output_collection = self.collection_manager.get(CollectionKeys.OUTPUTS) for key in logs: - if key in ["y", "y_pred"]: + if key in [ModelOutput.Y, ModelOutput.Y_PRED]: tensor_ref = TensorRef.from_non_graph_var(export_names[key]) output_collection.set_tensor_ref(tensor_ref) self.tensor_to_collections[export_names[key]] = {output_collection} @@ -390,7 +394,7 @@ def _save_metrics(self, batch, logs, force_save=False): self._initialize_writers(only_initialize_if_missing=True) logs["batch"] = batch for key in logs: - if key in ["loss", "val_loss", "outputs", "y", "y_pred"]: + if key in ["loss", "val_loss", "outputs", ModelOutput.Y, ModelOutput.Y_PRED]: # outputs is saved differently through outputs collection continue self._add_metric(metric_name=key) @@ -495,7 +499,6 @@ def on_epoch_end(self, batch, logs=None): if self._is_not_supported(): return self._save_metrics(batch=batch, logs=logs, force_save=True) - self._save_model_outputs(logs=logs) self._close_writers() def _on_any_mode_begin(self, mode): diff --git a/smdebug/tensorflow/utils.py b/smdebug/tensorflow/utils.py index c58469023..5bd94329b 100644 --- a/smdebug/tensorflow/utils.py +++ b/smdebug/tensorflow/utils.py @@ -12,6 +12,11 @@ from smdebug.core.modes import ModeKeys +class ModelOutput: + Y = "y" + Y_PRED = "y_pred" + + class TFDistributionStrategy(Enum): NONE = 0 HOROVOD = 1 From 666bcd4d5bf62c2212407b4b978d371d139c6f89 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Thu, 4 Jun 2020 02:40:13 -0700 Subject: [PATCH 007/149] update to test --- tests/tensorflow2/test_keras.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 4339dd39f..38e238231 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -22,6 +22,7 @@ from smdebug.core.json_config import CONFIG_FILE_PATH_ENV_STR from smdebug.core.modes import ModeKeys from smdebug.core.reduction_config import ALLOWED_NORMS, ALLOWED_REDUCTIONS +from smdebug.core.utils import ModelOutput from smdebug.exceptions import TensorUnavailableForStep from smdebug.tensorflow import ReductionConfig, SaveConfig @@ -396,7 +397,8 @@ def test_gradtape_persistent(out_dir, saveall): @pytest.mark.slow @pytest.mark.parametrize("saveall", [True, False]) def test_keras_fit(out_dir, tf_eager_mode, saveall): - hook = smd.KerasHook(out_dir=out_dir, save_all=saveall) + save_config = SaveConfig(save_interval=1) if saveall else None + hook = smd.KerasHook(out_dir=out_dir, save_all=saveall, save_config=save_config) ts = time.time() hook.save_scalar("foobar", 1, sm_metric=True, timestamp=ts) scalars_to_be_saved = dict() @@ -411,6 +413,9 @@ def test_keras_fit(out_dir, tf_eager_mode, saveall): trial = smd.create_trial(path=out_dir) # can't save gradients in TF 2.x eager mode if saveall: # save losses, metrics, weights, biases, scalar + assert len(trial.steps(mode=ModeKeys.TRAIN)) == 20 + assert len(trial.steps(mode=ModeKeys.EVAL)) == 10 + assert len(trial.steps(mode=ModeKeys.PREDICT)) == 4 if tf_eager_mode: assert len(trial.tensor_names()) == (15 if is_tf_2_2() else 16) else: @@ -434,8 +439,14 @@ def test_keras_fit(out_dir, tf_eager_mode, saveall): ) for tname in trial.tensor_names(collection=CollectionKeys.OUTPUTS): output = trial.tensor(tname) - assert tname in ["train_output/y_pred", "train_output/y", "predict_output"] + assert tname in [ModelOutput.Y_PRED, ModelOutput.Y] assert output.value(0) is not None + assert output.steps() == trial.steps(mode=ModeKeys.TRAIN) + # Check the shape of output tensors + assert trial.tensor(ModelOutput.Y).value(0).shape[1] == 1 # label + assert ( + trial.tensor(ModelOutput.Y_PRED).value(0).shape[1] == 10 + ) # Output probability for each class else: # save the default losses and metrics assert len(trial.tensor_names()) == (4 if is_tf_2_2() and tf_eager_mode else 5) assert len(trial.tensor_names(collection=CollectionKeys.LOSSES)) == 1 From d867a9bf1689e2bfc147628b8eb26e5c62132e3e Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Thu, 4 Jun 2020 02:41:37 -0700 Subject: [PATCH 008/149] update import statement --- tests/tensorflow2/test_keras.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 38e238231..4227046d8 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -22,9 +22,9 @@ from smdebug.core.json_config import CONFIG_FILE_PATH_ENV_STR from smdebug.core.modes import ModeKeys from smdebug.core.reduction_config import ALLOWED_NORMS, ALLOWED_REDUCTIONS -from smdebug.core.utils import ModelOutput from smdebug.exceptions import TensorUnavailableForStep from smdebug.tensorflow import ReductionConfig, SaveConfig +from smdebug.tensorflow.utils import ModelOutput def helper_keras_fit( From 5fd3a7445fa555c1525787e38b7cdde00d88cdd4 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Thu, 4 Jun 2020 22:45:52 -0700 Subject: [PATCH 009/149] tmp --- smdebug/tensorflow/keras.py | 44 ++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 84fa8e613..ee1b97cd3 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -352,38 +352,36 @@ def _save_inputs(self, check_before_write=True): # TODO pass - def _add_metric(self, metric_name, metric_value: tf.Tensor = None): - if metric_name in self.tensor_to_collections: + def _save_tensor(self, t_name: str, t_value: tf.Tensor, collection: CollectionKeys): + if t_name in self.tensor_to_collections: return + coll = self.collection_manager.get(collection) + if isinstance(t_value, tf.Tensor): + coll.set_tensor_ref(TensorRef.from_non_graph_var(t_name)) + else: + coll.set_tensor_ref(TensorRef.from_non_graph_var(t_name)) + self.tensor_to_collections[t_name] = {coll} + self._initialize_writers(only_initialize_if_missing=True) + self._save_for_tensor(t_name, t_value, check_before_write=False) + def _add_metric(self, metric_name, metric_value: tf.Tensor = None): if metric_name in ["loss", "val_loss"]: coll_name = CollectionKeys.LOSSES else: coll_name = CollectionKeys.METRICS - coll = self.collection_manager.get(coll_name) - if metric_value: - coll.set_tensor_ref(metric_value, metric_name) - else: - coll.set_tensor_ref(TensorRef.from_non_graph_var(metric_name)) - self.tensor_to_collections[metric_name] = {coll} + self._save_tensor(metric_name, metric_value, coll_name) def _save_model_outputs(self, logs): if logs is None: return - if self._is_collection_being_saved_for_step(CollectionKeys.OUTPUTS): export_names = { ModelOutput.Y_PRED: "train_output/y_pred", ModelOutput.Y: "train_output/y", } - self._initialize_writers(only_initialize_if_missing=True) - output_collection = self.collection_manager.get(CollectionKeys.OUTPUTS) for key in logs: if key in [ModelOutput.Y, ModelOutput.Y_PRED]: - tensor_ref = TensorRef.from_non_graph_var(export_names[key]) - output_collection.set_tensor_ref(tensor_ref) - self.tensor_to_collections[export_names[key]] = {output_collection} - self._save_for_tensor(export_names[key], logs[key], check_before_write=False) + self._save_tensor(export_names[key], logs[key], CollectionKeys.OUTPUTS) def _save_metrics(self, batch, logs, force_save=False): # if force_save is True, doesn't check whether collection needs to be saved for steps @@ -391,27 +389,33 @@ def _save_metrics(self, batch, logs, force_save=False): return if force_save or self._is_collection_being_saved_for_step(CollectionKeys.METRICS): - self._initialize_writers(only_initialize_if_missing=True) logs["batch"] = batch for key in logs: if key in ["loss", "val_loss", "outputs", ModelOutput.Y, ModelOutput.Y_PRED]: # outputs is saved differently through outputs collection continue - self._add_metric(metric_name=key) - self._save_for_tensor(key, logs[key], check_before_write=False) + self._add_metric(metric_name=key, metric_value=logs[key]) if force_save or self._is_collection_being_saved_for_step(CollectionKeys.LOSSES): self._initialize_writers(only_initialize_if_missing=True) for key in ["loss", "val_loss"]: if key in logs: - self._add_metric(metric_name=key) - self._save_for_tensor(key, logs[key], check_before_write=False) + self._add_metric(metric_name=key, metric_value=logs[key]) + + def _save_gradients(self, logs): + if logs is None: + return + + if self._is_collection_being_saved_for_step(CollectionKeys.METRICS): + if "gradients" in logs: + tensor_ref = TensorRef.from_non_graph_var(export_names[key]) def _save_tensors_post_step(self, batch, logs): # some tensors available as value from within hook are saved here # weights, metrics self._save_metrics(batch, logs) self._save_model_outputs(logs) + self._save_gradients(logs) if is_tf_version_2x() and tf.executing_eagerly(): for tensor_ref in self.tensor_refs_to_save_this_step: From 11c20c672716fe57248655c0003d8ad9be54dcf0 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Sun, 7 Jun 2020 23:07:32 -0700 Subject: [PATCH 010/149] Revert "tmp" This reverts commit 5fd3a7445fa555c1525787e38b7cdde00d88cdd4. --- smdebug/tensorflow/keras.py | 44 +++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index ee1b97cd3..84fa8e613 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -352,36 +352,38 @@ def _save_inputs(self, check_before_write=True): # TODO pass - def _save_tensor(self, t_name: str, t_value: tf.Tensor, collection: CollectionKeys): - if t_name in self.tensor_to_collections: + def _add_metric(self, metric_name, metric_value: tf.Tensor = None): + if metric_name in self.tensor_to_collections: return - coll = self.collection_manager.get(collection) - if isinstance(t_value, tf.Tensor): - coll.set_tensor_ref(TensorRef.from_non_graph_var(t_name)) - else: - coll.set_tensor_ref(TensorRef.from_non_graph_var(t_name)) - self.tensor_to_collections[t_name] = {coll} - self._initialize_writers(only_initialize_if_missing=True) - self._save_for_tensor(t_name, t_value, check_before_write=False) - def _add_metric(self, metric_name, metric_value: tf.Tensor = None): if metric_name in ["loss", "val_loss"]: coll_name = CollectionKeys.LOSSES else: coll_name = CollectionKeys.METRICS - self._save_tensor(metric_name, metric_value, coll_name) + coll = self.collection_manager.get(coll_name) + if metric_value: + coll.set_tensor_ref(metric_value, metric_name) + else: + coll.set_tensor_ref(TensorRef.from_non_graph_var(metric_name)) + self.tensor_to_collections[metric_name] = {coll} def _save_model_outputs(self, logs): if logs is None: return + if self._is_collection_being_saved_for_step(CollectionKeys.OUTPUTS): export_names = { ModelOutput.Y_PRED: "train_output/y_pred", ModelOutput.Y: "train_output/y", } + self._initialize_writers(only_initialize_if_missing=True) + output_collection = self.collection_manager.get(CollectionKeys.OUTPUTS) for key in logs: if key in [ModelOutput.Y, ModelOutput.Y_PRED]: - self._save_tensor(export_names[key], logs[key], CollectionKeys.OUTPUTS) + tensor_ref = TensorRef.from_non_graph_var(export_names[key]) + output_collection.set_tensor_ref(tensor_ref) + self.tensor_to_collections[export_names[key]] = {output_collection} + self._save_for_tensor(export_names[key], logs[key], check_before_write=False) def _save_metrics(self, batch, logs, force_save=False): # if force_save is True, doesn't check whether collection needs to be saved for steps @@ -389,33 +391,27 @@ def _save_metrics(self, batch, logs, force_save=False): return if force_save or self._is_collection_being_saved_for_step(CollectionKeys.METRICS): + self._initialize_writers(only_initialize_if_missing=True) logs["batch"] = batch for key in logs: if key in ["loss", "val_loss", "outputs", ModelOutput.Y, ModelOutput.Y_PRED]: # outputs is saved differently through outputs collection continue - self._add_metric(metric_name=key, metric_value=logs[key]) + self._add_metric(metric_name=key) + self._save_for_tensor(key, logs[key], check_before_write=False) if force_save or self._is_collection_being_saved_for_step(CollectionKeys.LOSSES): self._initialize_writers(only_initialize_if_missing=True) for key in ["loss", "val_loss"]: if key in logs: - self._add_metric(metric_name=key, metric_value=logs[key]) - - def _save_gradients(self, logs): - if logs is None: - return - - if self._is_collection_being_saved_for_step(CollectionKeys.METRICS): - if "gradients" in logs: - tensor_ref = TensorRef.from_non_graph_var(export_names[key]) + self._add_metric(metric_name=key) + self._save_for_tensor(key, logs[key], check_before_write=False) def _save_tensors_post_step(self, batch, logs): # some tensors available as value from within hook are saved here # weights, metrics self._save_metrics(batch, logs) self._save_model_outputs(logs) - self._save_gradients(logs) if is_tf_version_2x() and tf.executing_eagerly(): for tensor_ref in self.tensor_refs_to_save_this_step: From 7f260e5e4b053647722a74ac057213e015a5cfff Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Sun, 7 Jun 2020 23:08:13 -0700 Subject: [PATCH 011/149] str_to_mode --- smdebug/core/modes.py | 13 +++++++++++++ smdebug/core/save_config.py | 4 +++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/smdebug/core/modes.py b/smdebug/core/modes.py index fce21ad1d..809df1ce9 100644 --- a/smdebug/core/modes.py +++ b/smdebug/core/modes.py @@ -14,3 +14,16 @@ class ModeKeys(Enum): ALLOWED_MODE_NAMES = [x.name for x in ALLOWED_MODES] MODE_STEP_PLUGIN_NAME = "mode_step" MODE_PLUGIN_NAME = "mode" + + +def str_to_mode_keys(s): + if s == "train": + return ModeKeys.TRAIN + elif s == "eval": + return ModeKeys.EVAL + elif s == "predict": + return ModeKeys.PREDICT + elif s == "global": + return ModeKeys.GLOBAL + else: + raise Exception("Invalid mode") diff --git a/smdebug/core/save_config.py b/smdebug/core/save_config.py index 5955c3d26..e4758df94 100644 --- a/smdebug/core/save_config.py +++ b/smdebug/core/save_config.py @@ -3,7 +3,7 @@ from typing import Any, Dict, List, Union # First Party -from smdebug.core.modes import ModeKeys +from smdebug.core.modes import ModeKeys, str_to_mode_keys from smdebug.core.utils import step_in_range DEFAULT_SAVE_CONFIG_INTERVAL = 500 @@ -83,6 +83,8 @@ def set_save_config(self, mode: ModeKeys, save_config_mode: "SaveConfigMode") -> self.mode_save_configs[mode] = save_config_mode def should_save_step(self, mode, step_num) -> bool: + if isinstance(mode, str): + mode = str_to_mode_keys(mode) return self.get_save_config(mode).should_save_step(step_num) def to_json_dict(self) -> Dict: From 345f7850a88f46f55ed3e3a6e8b880495a96d57c Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Mon, 8 Jun 2020 00:25:12 -0700 Subject: [PATCH 012/149] add tensor --- smdebug/tensorflow/keras.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 84fa8e613..da2b2997d 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -380,8 +380,12 @@ def _save_model_outputs(self, logs): output_collection = self.collection_manager.get(CollectionKeys.OUTPUTS) for key in logs: if key in [ModelOutput.Y, ModelOutput.Y_PRED]: - tensor_ref = TensorRef.from_non_graph_var(export_names[key]) - output_collection.set_tensor_ref(tensor_ref) + tensor_value = logs[key] + if isinstance(tensor_value, values.PerReplica): + output_collection.add(tensor_value) + else: + tensor_ref = TensorRef.from_non_graph_var(export_names[key]) + output_collection.set_tensor_ref(tensor_ref) self.tensor_to_collections[export_names[key]] = {output_collection} self._save_for_tensor(export_names[key], logs[key], check_before_write=False) From beaa68dfa34f0a4c6ebdcd0ea4d2a8f19d4348c4 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Mon, 8 Jun 2020 00:29:44 -0700 Subject: [PATCH 013/149] add tensor --- smdebug/tensorflow/keras.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index da2b2997d..4a1e78903 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -382,7 +382,9 @@ def _save_model_outputs(self, logs): if key in [ModelOutput.Y, ModelOutput.Y_PRED]: tensor_value = logs[key] if isinstance(tensor_value, values.PerReplica): - output_collection.add(tensor_value) + output_collection.add_distributed_variable( + tensor_value, export_names=export_names[key] + ) else: tensor_ref = TensorRef.from_non_graph_var(export_names[key]) output_collection.set_tensor_ref(tensor_ref) From ab3d5c11cc256c2cd0f7f5431c96c4b88df56259 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Mon, 8 Jun 2020 01:18:30 -0700 Subject: [PATCH 014/149] add dist tensor: --- smdebug/tensorflow/keras.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 4a1e78903..15556ff3d 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -381,12 +381,15 @@ def _save_model_outputs(self, logs): for key in logs: if key in [ModelOutput.Y, ModelOutput.Y_PRED]: tensor_value = logs[key] + tensor_refs = [] if isinstance(tensor_value, values.PerReplica): - output_collection.add_distributed_variable( - tensor_value, export_names=export_names[key] - ) + for t in tensor_value: + tensor_ref = TensorRef.from_non_graph_var(export_names[key]) + tensor_refs.append(tensor_ref) else: tensor_ref = TensorRef.from_non_graph_var(export_names[key]) + tensor_refs.append(tensor_ref) + for tensor_ref in tensor_refs: output_collection.set_tensor_ref(tensor_ref) self.tensor_to_collections[export_names[key]] = {output_collection} self._save_for_tensor(export_names[key], logs[key], check_before_write=False) From 61372e8688e05e1473db3e8a9222164a132e1884 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Mon, 8 Jun 2020 01:27:09 -0700 Subject: [PATCH 015/149] add tensor --- smdebug/tensorflow/keras.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 15556ff3d..1e8b82f09 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -383,16 +383,19 @@ def _save_model_outputs(self, logs): tensor_value = logs[key] tensor_refs = [] if isinstance(tensor_value, values.PerReplica): - for t in tensor_value: + for t in tensor_value._values: tensor_ref = TensorRef.from_non_graph_var(export_names[key]) tensor_refs.append(tensor_ref) + self._save_for_tensor(export_names[key], t, check_before_write=False) else: tensor_ref = TensorRef.from_non_graph_var(export_names[key]) tensor_refs.append(tensor_ref) + self._save_for_tensor( + export_names[key], logs[key], check_before_write=False + ) for tensor_ref in tensor_refs: output_collection.set_tensor_ref(tensor_ref) self.tensor_to_collections[export_names[key]] = {output_collection} - self._save_for_tensor(export_names[key], logs[key], check_before_write=False) def _save_metrics(self, batch, logs, force_save=False): # if force_save is True, doesn't check whether collection needs to be saved for steps From 46c5e0ff107f0532a85589c6cb03ae25b09d316c Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Mon, 8 Jun 2020 01:30:32 -0700 Subject: [PATCH 016/149] for-loop --- smdebug/tensorflow/keras.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 1e8b82f09..729048be6 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -385,16 +385,14 @@ def _save_model_outputs(self, logs): if isinstance(tensor_value, values.PerReplica): for t in tensor_value._values: tensor_ref = TensorRef.from_non_graph_var(export_names[key]) - tensor_refs.append(tensor_ref) - self._save_for_tensor(export_names[key], t, check_before_write=False) + tensor_refs.append(tensor_ref, t) + else: tensor_ref = TensorRef.from_non_graph_var(export_names[key]) - tensor_refs.append(tensor_ref) - self._save_for_tensor( - export_names[key], logs[key], check_before_write=False - ) - for tensor_ref in tensor_refs: + tensor_refs.append(tensor_ref, logs[key]) + for tensor_ref, t in tensor_refs: output_collection.set_tensor_ref(tensor_ref) + self._save_for_tensor(export_names[key], t, check_before_write=False) self.tensor_to_collections[export_names[key]] = {output_collection} def _save_metrics(self, batch, logs, force_save=False): From 650fd6aa1f6bc7a2d788333954f45c4bd3bb8dd2 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Mon, 8 Jun 2020 01:41:22 -0700 Subject: [PATCH 017/149] fix append --- smdebug/tensorflow/keras.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 729048be6..dcfb029a3 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -385,11 +385,11 @@ def _save_model_outputs(self, logs): if isinstance(tensor_value, values.PerReplica): for t in tensor_value._values: tensor_ref = TensorRef.from_non_graph_var(export_names[key]) - tensor_refs.append(tensor_ref, t) + tensor_refs.append((tensor_ref, t)) else: tensor_ref = TensorRef.from_non_graph_var(export_names[key]) - tensor_refs.append(tensor_ref, logs[key]) + tensor_refs.append((tensor_ref, logs[key])) for tensor_ref, t in tensor_refs: output_collection.set_tensor_ref(tensor_ref) self._save_for_tensor(export_names[key], t, check_before_write=False) From 42fdc3aae2a884e1b0b41f0aebcb74cb7e8accf9 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Mon, 8 Jun 2020 01:51:46 -0700 Subject: [PATCH 018/149] fix assert --- tests/tensorflow2/test_keras.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 4227046d8..be5ce5df5 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -439,7 +439,7 @@ def test_keras_fit(out_dir, tf_eager_mode, saveall): ) for tname in trial.tensor_names(collection=CollectionKeys.OUTPUTS): output = trial.tensor(tname) - assert tname in [ModelOutput.Y_PRED, ModelOutput.Y] + assert tname in ["train_output/y", "train_output/y_pred"] assert output.value(0) is not None assert output.steps() == trial.steps(mode=ModeKeys.TRAIN) # Check the shape of output tensors From 16b38d16506d59d316bef5be808dcfe9cb247122 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Mon, 8 Jun 2020 01:59:49 -0700 Subject: [PATCH 019/149] add --- tests/tensorflow2/test_keras.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index be5ce5df5..368f7f6ed 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -441,7 +441,7 @@ def test_keras_fit(out_dir, tf_eager_mode, saveall): output = trial.tensor(tname) assert tname in ["train_output/y", "train_output/y_pred"] assert output.value(0) is not None - assert output.steps() == trial.steps(mode=ModeKeys.TRAIN) + assert output.steps(mode=ModeKeys.TRAIN) == trial.steps(mode=ModeKeys.TRAIN) # Check the shape of output tensors assert trial.tensor(ModelOutput.Y).value(0).shape[1] == 1 # label assert ( From 9e1d2c5bc45cb15b7680d0acdd26dcd203238339 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Mon, 8 Jun 2020 02:02:54 -0700 Subject: [PATCH 020/149] model output --- tests/tensorflow2/test_keras.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 368f7f6ed..543829f6a 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -24,7 +24,6 @@ from smdebug.core.reduction_config import ALLOWED_NORMS, ALLOWED_REDUCTIONS from smdebug.exceptions import TensorUnavailableForStep from smdebug.tensorflow import ReductionConfig, SaveConfig -from smdebug.tensorflow.utils import ModelOutput def helper_keras_fit( @@ -443,9 +442,9 @@ def test_keras_fit(out_dir, tf_eager_mode, saveall): assert output.value(0) is not None assert output.steps(mode=ModeKeys.TRAIN) == trial.steps(mode=ModeKeys.TRAIN) # Check the shape of output tensors - assert trial.tensor(ModelOutput.Y).value(0).shape[1] == 1 # label + assert trial.tensor("train_output/y").value(0).shape[1] == 1 # label assert ( - trial.tensor(ModelOutput.Y_PRED).value(0).shape[1] == 10 + trial.tensor("train_output/y_pred").value(0).shape[1] == 10 ) # Output probability for each class else: # save the default losses and metrics assert len(trial.tensor_names()) == (4 if is_tf_2_2() and tf_eager_mode else 5) From 14d911bbca8b8b31d84ab14bb538d02fb0c87c14 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Mon, 8 Jun 2020 02:36:27 -0700 Subject: [PATCH 021/149] rename --- smdebug/tensorflow/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/smdebug/tensorflow/utils.py b/smdebug/tensorflow/utils.py index 5bd94329b..3c8e8176e 100644 --- a/smdebug/tensorflow/utils.py +++ b/smdebug/tensorflow/utils.py @@ -13,8 +13,8 @@ class ModelOutput: - Y = "y" - Y_PRED = "y_pred" + Y = "smdebug_y" + Y_PRED = "smdebug_y_pred" class TFDistributionStrategy(Enum): From 20d0413f0a6cb724059a8f3badf9b6517c8dc513 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Tue, 9 Jun 2020 02:51:31 -0700 Subject: [PATCH 022/149] add to all collections --- smdebug/tensorflow/keras.py | 43 +++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index dcfb029a3..141c4aa55 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -371,29 +371,34 @@ def _save_model_outputs(self, logs): if logs is None: return - if self._is_collection_being_saved_for_step(CollectionKeys.OUTPUTS): - export_names = { - ModelOutput.Y_PRED: "train_output/y_pred", - ModelOutput.Y: "train_output/y", - } - self._initialize_writers(only_initialize_if_missing=True) - output_collection = self.collection_manager.get(CollectionKeys.OUTPUTS) - for key in logs: - if key in [ModelOutput.Y, ModelOutput.Y_PRED]: - tensor_value = logs[key] - tensor_refs = [] - if isinstance(tensor_value, values.PerReplica): - for t in tensor_value._values: - tensor_ref = TensorRef.from_non_graph_var(export_names[key]) - tensor_refs.append((tensor_ref, t)) + export_names = {ModelOutput.Y_PRED: "train_output/y_pred", ModelOutput.Y: "train_output/y"} - else: + for key in logs: + if key in [ModelOutput.Y, ModelOutput.Y_PRED]: + collections_to_save = self._get_collections_to_save_for_step() + collections_to_save = self._get_collections_with_tensor( + export_names[key] + ).intersection(collections_to_save) + + self._initialize_writers(only_initialize_if_missing=True) + tensor_value = logs[key] + tensor_refs = [] + if isinstance(tensor_value, values.PerReplica): + for t in tensor_value._values: tensor_ref = TensorRef.from_non_graph_var(export_names[key]) - tensor_refs.append((tensor_ref, logs[key])) + tensor_refs.append((tensor_ref, t)) + else: + tensor_ref = TensorRef.from_non_graph_var(export_names[key]) + tensor_refs.append((tensor_ref, logs[key])) + + for collection in collections_to_save: for tensor_ref, t in tensor_refs: - output_collection.set_tensor_ref(tensor_ref) + collection.set_tensor_ref(tensor_ref) self._save_for_tensor(export_names[key], t, check_before_write=False) - self.tensor_to_collections[export_names[key]] = {output_collection} + if export_names[key] not in self.tensor_to_collections: + self.tensor_to_collections[export_names[key]] = {collection} + else: + self.tensor_to_collections[export_names[key]].add(collection) def _save_metrics(self, batch, logs, force_save=False): # if force_save is True, doesn't check whether collection needs to be saved for steps From d46ebb6762635afb7b42ed0264b6b00f89bfc3da Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Tue, 9 Jun 2020 03:18:23 -0700 Subject: [PATCH 023/149] revert --- smdebug/tensorflow/keras.py | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 141c4aa55..ad6a6e121 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -320,6 +320,7 @@ def _prepare_non_layer_tensors(self): for coll in [ self.get_collection(name=CollectionKeys.OPTIMIZER_VARIABLES), self.get_collection(name=CollectionKeys.GRADIENTS), + self.get_collection(name=CollectionKeys.OUTPUTS), ]: for tensor_ref in coll.get_tensors(): if tensor_ref.name not in self.tensor_to_collections: @@ -376,29 +377,24 @@ def _save_model_outputs(self, logs): for key in logs: if key in [ModelOutput.Y, ModelOutput.Y_PRED]: collections_to_save = self._get_collections_to_save_for_step() - collections_to_save = self._get_collections_with_tensor( - export_names[key] - ).intersection(collections_to_save) - - self._initialize_writers(only_initialize_if_missing=True) - tensor_value = logs[key] - tensor_refs = [] - if isinstance(tensor_value, values.PerReplica): - for t in tensor_value._values: + output_collection = self.get_collection(CollectionKeys.OUTPUTS) + if output_collection in collections_to_save: + self._initialize_writers(only_initialize_if_missing=True) + tensor_value = logs[key] + tensor_refs = [] + if isinstance(tensor_value, values.PerReplica): + for t in tensor_value._values: + tensor_ref = TensorRef.from_non_graph_var(export_names[key]) + tensor_refs.append((tensor_ref, t)) + else: tensor_ref = TensorRef.from_non_graph_var(export_names[key]) - tensor_refs.append((tensor_ref, t)) - else: - tensor_ref = TensorRef.from_non_graph_var(export_names[key]) - tensor_refs.append((tensor_ref, logs[key])) + tensor_refs.append((tensor_ref, logs[key])) - for collection in collections_to_save: for tensor_ref, t in tensor_refs: - collection.set_tensor_ref(tensor_ref) + output_collection.set_tensor_ref(tensor_ref) self._save_for_tensor(export_names[key], t, check_before_write=False) if export_names[key] not in self.tensor_to_collections: - self.tensor_to_collections[export_names[key]] = {collection} - else: - self.tensor_to_collections[export_names[key]].add(collection) + self.tensor_to_collections[export_names[key]] = {output_collection} def _save_metrics(self, batch, logs, force_save=False): # if force_save is True, doesn't check whether collection needs to be saved for steps @@ -619,14 +615,13 @@ def _on_any_batch_end(self, batch, mode, logs=None): if not is_tf_version_2x() or (is_tf_version_2x() and not tf.executing_eagerly()): self._remove_fetches_and_callbacks(mode) + self._save_tensors_post_step(batch, logs) if is_tf_version_2x() and tf.executing_eagerly(): # Need to prepare non layer tensors again since # some tensors only become available on batch end self._prepare_non_layer_tensors() self._write_optimizer_variables() - self._save_tensors_post_step(batch, logs) - if self._prepared_tensors[mode]: if self._exported_collections is False: # in keras, these collections change when mode changes From 960d38305a99b57144d7934b27dd1de770a83e46 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Tue, 9 Jun 2020 03:50:16 -0700 Subject: [PATCH 024/149] add to all --- smdebug/tensorflow/keras.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index ad6a6e121..feef494e9 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -379,6 +379,10 @@ def _save_model_outputs(self, logs): collections_to_save = self._get_collections_to_save_for_step() output_collection = self.get_collection(CollectionKeys.OUTPUTS) if output_collection in collections_to_save: + collections_to_write = {output_collection} + for collection in collections_to_save: + if match_inc(export_names[key], collection.include_regex): + collections_to_write.add(collection) self._initialize_writers(only_initialize_if_missing=True) tensor_value = logs[key] tensor_refs = [] @@ -391,10 +395,9 @@ def _save_model_outputs(self, logs): tensor_refs.append((tensor_ref, logs[key])) for tensor_ref, t in tensor_refs: - output_collection.set_tensor_ref(tensor_ref) + for collection in collections_to_write: + collection.set_tensor_ref(tensor_ref) self._save_for_tensor(export_names[key], t, check_before_write=False) - if export_names[key] not in self.tensor_to_collections: - self.tensor_to_collections[export_names[key]] = {output_collection} def _save_metrics(self, batch, logs, force_save=False): # if force_save is True, doesn't check whether collection needs to be saved for steps From 67f4efc82d2b2a635dcb9ff657fa5dd81c98479b Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Tue, 9 Jun 2020 04:47:34 -0700 Subject: [PATCH 025/149] helper fn --- smdebug/core/save_config.py | 4 +--- smdebug/tensorflow/keras.py | 7 ++++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/smdebug/core/save_config.py b/smdebug/core/save_config.py index e4758df94..5955c3d26 100644 --- a/smdebug/core/save_config.py +++ b/smdebug/core/save_config.py @@ -3,7 +3,7 @@ from typing import Any, Dict, List, Union # First Party -from smdebug.core.modes import ModeKeys, str_to_mode_keys +from smdebug.core.modes import ModeKeys from smdebug.core.utils import step_in_range DEFAULT_SAVE_CONFIG_INTERVAL = 500 @@ -83,8 +83,6 @@ def set_save_config(self, mode: ModeKeys, save_config_mode: "SaveConfigMode") -> self.mode_save_configs[mode] = save_config_mode def should_save_step(self, mode, step_num) -> bool: - if isinstance(mode, str): - mode = str_to_mode_keys(mode) return self.get_save_config(mode).should_save_step(step_num) def to_json_dict(self) -> Dict: diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index feef494e9..292a86ea3 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -6,7 +6,7 @@ from tensorflow.python.distribute import values # First Party -from smdebug.core.modes import ModeKeys +from smdebug.core.modes import ModeKeys, str_to_mode_keys from smdebug.core.utils import match_inc from smdebug.tensorflow.callable_cache import CallableCache @@ -96,6 +96,11 @@ def _is_not_supported(self): self._hook_supported = False return not self._hook_supported + def should_save_global_step_for_mode(self, mode: str): + mode = str_to_mode_keys(mode) + mode_step = self.mode_steps[mode] + return self.save_config.should_save_step(mode, mode_step) + def _get_matching_collections( self, mode, tensor, tensor_type, ts_name, is_input_to_model=False, is_output_of_model=False ): From 2df341ea9bb2c4b21313756b8ba169dbf04aaa1a Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Tue, 9 Jun 2020 05:05:52 -0700 Subject: [PATCH 026/149] helper fn --- smdebug/tensorflow/keras.py | 10 ++++++++-- smdebug/tensorflow/utils.py | 5 +++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 292a86ea3..26cecbfee 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -16,6 +16,7 @@ from .tensor_ref import TensorRef, get_tf_names from .utils import ( ModelOutput, + ModelOutputs, TFDistributionStrategy, get_export_name_for_keras, get_keras_layer_inputs, @@ -377,7 +378,12 @@ def _save_model_outputs(self, logs): if logs is None: return - export_names = {ModelOutput.Y_PRED: "train_output/y_pred", ModelOutput.Y: "train_output/y"} + export_names = { + ModelOutput.Y_PRED: "y_pred", + ModelOutput.Y: "y", + ModelOutput.VAL_Y: "val_y", + ModelOutput.VAL_Y_PRED: "val_y_pred", + } for key in logs: if key in [ModelOutput.Y, ModelOutput.Y_PRED]: @@ -413,7 +419,7 @@ def _save_metrics(self, batch, logs, force_save=False): self._initialize_writers(only_initialize_if_missing=True) logs["batch"] = batch for key in logs: - if key in ["loss", "val_loss", "outputs", ModelOutput.Y, ModelOutput.Y_PRED]: + if key in ["loss", "val_loss", "outputs"].extend(ModelOutputs): # outputs is saved differently through outputs collection continue self._add_metric(metric_name=key) diff --git a/smdebug/tensorflow/utils.py b/smdebug/tensorflow/utils.py index 3c8e8176e..43e41356f 100644 --- a/smdebug/tensorflow/utils.py +++ b/smdebug/tensorflow/utils.py @@ -15,6 +15,11 @@ class ModelOutput: Y = "smdebug_y" Y_PRED = "smdebug_y_pred" + VAL_Y = "val_smdebug_y" + VAL_Y_PRED = "val_smdebug_y" + + +ModelOutputs = [ModelOutput.Y, ModelOutput.VAL_Y, ModelOutput.VAL_Y_PRED] class TFDistributionStrategy(Enum): From 94765d29275999f95f25608d76bf674d6c97ac8f Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Tue, 9 Jun 2020 05:15:18 -0700 Subject: [PATCH 027/149] extend returns none --- smdebug/tensorflow/keras.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 26cecbfee..4b4bc7845 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -419,7 +419,7 @@ def _save_metrics(self, batch, logs, force_save=False): self._initialize_writers(only_initialize_if_missing=True) logs["batch"] = batch for key in logs: - if key in ["loss", "val_loss", "outputs"].extend(ModelOutputs): + if key in ["loss", "val_loss", "outputs"] + ModelOutputs: # outputs is saved differently through outputs collection continue self._add_metric(metric_name=key) From 9eff79bd58f03e3f30da98cbfe7222e11f63f403 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Tue, 9 Jun 2020 05:19:12 -0700 Subject: [PATCH 028/149] ypred --- smdebug/tensorflow/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smdebug/tensorflow/utils.py b/smdebug/tensorflow/utils.py index 43e41356f..9c6e2d429 100644 --- a/smdebug/tensorflow/utils.py +++ b/smdebug/tensorflow/utils.py @@ -19,7 +19,7 @@ class ModelOutput: VAL_Y_PRED = "val_smdebug_y" -ModelOutputs = [ModelOutput.Y, ModelOutput.VAL_Y, ModelOutput.VAL_Y_PRED] +ModelOutputs = [ModelOutput.Y, ModelOutput.Y_PRED, ModelOutput.VAL_Y, ModelOutput.VAL_Y_PRED] class TFDistributionStrategy(Enum): From 61d94e191a7e62e1fe0d7760829cc0fa5bb77d90 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Tue, 9 Jun 2020 05:20:04 -0700 Subject: [PATCH 029/149] ypred --- smdebug/tensorflow/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smdebug/tensorflow/utils.py b/smdebug/tensorflow/utils.py index 9c6e2d429..6eed8ee44 100644 --- a/smdebug/tensorflow/utils.py +++ b/smdebug/tensorflow/utils.py @@ -16,7 +16,7 @@ class ModelOutput: Y = "smdebug_y" Y_PRED = "smdebug_y_pred" VAL_Y = "val_smdebug_y" - VAL_Y_PRED = "val_smdebug_y" + VAL_Y_PRED = "val_smdebug_y_pred" ModelOutputs = [ModelOutput.Y, ModelOutput.Y_PRED, ModelOutput.VAL_Y, ModelOutput.VAL_Y_PRED] From 07d72d3df6caf16e5995cbfb390eb6d5bfadffdf Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Tue, 9 Jun 2020 05:33:06 -0700 Subject: [PATCH 030/149] change assert --- tests/tensorflow2/test_keras.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 543829f6a..ab4c20a69 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -438,14 +438,12 @@ def test_keras_fit(out_dir, tf_eager_mode, saveall): ) for tname in trial.tensor_names(collection=CollectionKeys.OUTPUTS): output = trial.tensor(tname) - assert tname in ["train_output/y", "train_output/y_pred"] + assert tname in ["y", "y_pred"] assert output.value(0) is not None assert output.steps(mode=ModeKeys.TRAIN) == trial.steps(mode=ModeKeys.TRAIN) # Check the shape of output tensors - assert trial.tensor("train_output/y").value(0).shape[1] == 1 # label - assert ( - trial.tensor("train_output/y_pred").value(0).shape[1] == 10 - ) # Output probability for each class + assert trial.tensor("y").value(0).shape[1] == 1 # label + assert trial.tensor("y_pred").value(0).shape[1] == 10 # Output probability for each class else: # save the default losses and metrics assert len(trial.tensor_names()) == (4 if is_tf_2_2() and tf_eager_mode else 5) assert len(trial.tensor_names(collection=CollectionKeys.LOSSES)) == 1 From d8a8ea9c99d077a4083360479585a12fccf0d0cd Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Wed, 10 Jun 2020 09:53:12 -0700 Subject: [PATCH 031/149] init --- smdebug/tensorflow/keras.py | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 4b4bc7845..2b73bd18c 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -355,9 +355,31 @@ def _prepare_tensors_for_step(self, mode): non_input_tensors = set(coll.get_tensors(mode=mode)).difference(input_tensors_set) self.tensor_refs_to_save_this_step.update(non_input_tensors) - def _save_inputs(self, check_before_write=True): - # TODO - pass + def _save_inputs(self, logs): + for key in logs: + if key in ["smdebug_input"]: + collections_to_save = self._get_collections_to_save_for_step() + input_collection = self.get_collection(CollectionKeys.INPUTS) + if input_collection in collections_to_save: + collections_to_write = {input_collection} + for collection in collections_to_save: + if match_inc(input_collection[key], collection.include_regex): + collections_to_write.add(collection) + self._initialize_writers(only_initialize_if_missing=True) + tensor_value = logs[key] + tensor_refs = [] + if isinstance(tensor_value, values.PerReplica): + for t in tensor_value._values: + tensor_ref = TensorRef.from_non_graph_var("model_input") + tensor_refs.append((tensor_ref, t)) + else: + tensor_ref = TensorRef.from_non_graph_var("model_input") + tensor_refs.append((tensor_ref, logs[key])) + + for tensor_ref, t in tensor_refs: + for collection in collections_to_write: + collection.set_tensor_ref(tensor_ref) + self._save_for_tensor("model_input", t, check_before_write=False) def _add_metric(self, metric_name, metric_value: tf.Tensor = None): if metric_name in self.tensor_to_collections: @@ -446,7 +468,7 @@ def _save_tensors_post_step(self, batch, logs): ) if self._is_collection_being_saved_for_step(CollectionKeys.INPUTS): - self._save_inputs(check_before_write=False) + self._save_inputs(logs) def _get_exec_function(self, mode): # exec_function is None in 2.X; self.model exists but has no train_function, test_function, etc. From f7ead88fe2a71678e24a209b4c6c5f00e4baacef Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Wed, 10 Jun 2020 10:25:25 -0700 Subject: [PATCH 032/149] do not match in metric --- smdebug/tensorflow/keras.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 2b73bd18c..e3d5a1e3e 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -357,7 +357,7 @@ def _prepare_tensors_for_step(self, mode): def _save_inputs(self, logs): for key in logs: - if key in ["smdebug_input"]: + if key in ["smdebug_model_input"]: collections_to_save = self._get_collections_to_save_for_step() input_collection = self.get_collection(CollectionKeys.INPUTS) if input_collection in collections_to_save: @@ -441,7 +441,7 @@ def _save_metrics(self, batch, logs, force_save=False): self._initialize_writers(only_initialize_if_missing=True) logs["batch"] = batch for key in logs: - if key in ["loss", "val_loss", "outputs"] + ModelOutputs: + if key in ["loss", "val_loss", "outputs", "smdebug_model_input"] + ModelOutputs: # outputs is saved differently through outputs collection continue self._add_metric(metric_name=key) From 6e24ca8e2453fc2bde0c95c6fe7a6f6f122d72da Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Wed, 10 Jun 2020 11:22:51 -0700 Subject: [PATCH 033/149] update --- smdebug/tensorflow/keras.py | 38 +++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index e3d5a1e3e..65cfda4a6 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -363,23 +363,27 @@ def _save_inputs(self, logs): if input_collection in collections_to_save: collections_to_write = {input_collection} for collection in collections_to_save: - if match_inc(input_collection[key], collection.include_regex): + if match_inc(key, collection.include_regex): collections_to_write.add(collection) self._initialize_writers(only_initialize_if_missing=True) - tensor_value = logs[key] - tensor_refs = [] - if isinstance(tensor_value, values.PerReplica): - for t in tensor_value._values: - tensor_ref = TensorRef.from_non_graph_var("model_input") - tensor_refs.append((tensor_ref, t)) - else: - tensor_ref = TensorRef.from_non_graph_var("model_input") - tensor_refs.append((tensor_ref, logs[key])) - - for tensor_ref, t in tensor_refs: - for collection in collections_to_write: - collection.set_tensor_ref(tensor_ref) - self._save_for_tensor("model_input", t, check_before_write=False) + for tensor_value in logs[key]: + tensor_id = 0 + tensor_refs = [] + if isinstance(tensor_value, values.PerReplica): + for t in tensor_value._values: + tensor_ref = TensorRef.from_non_graph_var( + f"model_input:{tensor_id}" + ) + tensor_refs.append((tensor_ref, t)) + else: + tensor_ref = TensorRef.from_non_graph_var(f"model_input:{tensor_id}") + tensor_refs.append((tensor_ref, tensor_value)) + + for tensor_ref, t in tensor_refs: + for collection in collections_to_write: + collection.set_tensor_ref(tensor_ref) + self._save_for_tensor("model_input", t, check_before_write=False) + tensor_id += 1 def _add_metric(self, metric_name, metric_value: tf.Tensor = None): if metric_name in self.tensor_to_collections: @@ -459,6 +463,7 @@ def _save_tensors_post_step(self, batch, logs): # weights, metrics self._save_metrics(batch, logs) self._save_model_outputs(logs) + self._save_inputs(logs) if is_tf_version_2x() and tf.executing_eagerly(): for tensor_ref in self.tensor_refs_to_save_this_step: @@ -467,9 +472,6 @@ def _save_tensors_post_step(self, batch, logs): tensor_name=tensor.name, tensor_value=tensor.value(), check_before_write=False ) - if self._is_collection_being_saved_for_step(CollectionKeys.INPUTS): - self._save_inputs(logs) - def _get_exec_function(self, mode): # exec_function is None in 2.X; self.model exists but has no train_function, test_function, etc. if self.distribution_strategy in [ From cda4e3e58989dc002a746f0ab4d54b997d4b9216 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Wed, 10 Jun 2020 11:28:12 -0700 Subject: [PATCH 034/149] inputs --- smdebug/tensorflow/keras.py | 1 + 1 file changed, 1 insertion(+) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 65cfda4a6..c78213e9c 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -327,6 +327,7 @@ def _prepare_non_layer_tensors(self): self.get_collection(name=CollectionKeys.OPTIMIZER_VARIABLES), self.get_collection(name=CollectionKeys.GRADIENTS), self.get_collection(name=CollectionKeys.OUTPUTS), + self.get_collection(name=CollectionKeys.INPUTS), ]: for tensor_ref in coll.get_tensors(): if tensor_ref.name not in self.tensor_to_collections: From 9b59d0de504290b7816cf146d55cf75690cc16ef Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Wed, 10 Jun 2020 11:37:10 -0700 Subject: [PATCH 035/149] id --- smdebug/tensorflow/keras.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index c78213e9c..14e37a4aa 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -383,7 +383,9 @@ def _save_inputs(self, logs): for tensor_ref, t in tensor_refs: for collection in collections_to_write: collection.set_tensor_ref(tensor_ref) - self._save_for_tensor("model_input", t, check_before_write=False) + self._save_for_tensor( + f"model_input:{tensor_id}", t, check_before_write=False + ) tensor_id += 1 def _add_metric(self, metric_name, metric_value: tf.Tensor = None): From 9e5606e36be2713902a946f923545417b4be4b1f Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Thu, 28 May 2020 23:06:31 -0700 Subject: [PATCH 036/149] save outputs --- smdebug/core/tfevent/event_file_reader.py | 1 + smdebug/tensorflow/base_hook.py | 2 +- smdebug/tensorflow/collection.py | 2 +- smdebug/tensorflow/keras.py | 19 ++++++++++++++++++- tests/tensorflow2/test_keras.py | 13 +++++++++++-- 5 files changed, 32 insertions(+), 5 deletions(-) diff --git a/smdebug/core/tfevent/event_file_reader.py b/smdebug/core/tfevent/event_file_reader.py index c5f4b5fd9..176fb9e38 100644 --- a/smdebug/core/tfevent/event_file_reader.py +++ b/smdebug/core/tfevent/event_file_reader.py @@ -38,6 +38,7 @@ def as_dtype(t): types_pb2.DT_INT64: np.int64, types_pb2.DT_STRING: np.str, types_pb2.DT_BOOL: np.bool, + types_pb2.DT_UINT8: np.uint8, } return _INTERN_TABLE[t] diff --git a/smdebug/tensorflow/base_hook.py b/smdebug/tensorflow/base_hook.py index baaa26476..a8cc9f679 100644 --- a/smdebug/tensorflow/base_hook.py +++ b/smdebug/tensorflow/base_hook.py @@ -437,7 +437,7 @@ def _write_for_tensor(self, tensor_name, tensor_value, save_collections, tensor_ # this tensor_name is tf tensor name, need to convert to export_name tensor_ref = self._get_tensor_ref(tensor_name, save_collections=save_collections) - if tensor_ref: + if tensor_ref is not None: name = tensor_ref.export_name super()._write_for_tensor( name, tensor_value, save_collections=save_collections, tensor_ref=tensor_ref diff --git a/smdebug/tensorflow/collection.py b/smdebug/tensorflow/collection.py index 89df3916a..900716479 100644 --- a/smdebug/tensorflow/collection.py +++ b/smdebug/tensorflow/collection.py @@ -58,7 +58,7 @@ def add_distributed_variable(self, arg, export_name=None, mode=None): def add_aggregating_variable(self, arg, name=None, mode=None): return self.add_variable(arg.get(), name, mode=mode) - def add_tensor(self, arg, name=None, mode=None): + def add_tensor(self, arg, name=None, mode=None, type=None): # in keras we need to store the mode and only get tensors by mode return self._store_tensor_ref(TensorRef.from_tensor(arg, name, mode=mode)) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index fea005a31..a0629cf5a 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -366,6 +366,21 @@ def _add_metric(self, metric_name, metric_value: tf.Tensor = None): coll.set_tensor_ref(TensorRef.from_non_graph_var(metric_name)) self.tensor_to_collections[metric_name] = {coll} + def _save_model_outputs(self, logs): + if logs is None: + return + + if self._is_collection_being_saved_for_step(CollectionKeys.OUTPUTS): + export_names = {"y_pred": "train_output/y_pred", "y": "train_output/y"} + self._initialize_writers(only_initialize_if_missing=True) + output_collection = self.collection_manager.get(CollectionKeys.OUTPUTS) + for key in logs: + if key in ["y", "y_pred"]: + tensor_ref = TensorRef.from_non_graph_var(export_names[key]) + output_collection.set_tensor_ref(tensor_ref) + self.tensor_to_collections[export_names[key]] = {output_collection} + self._save_for_tensor(export_names[key], logs[key], check_before_write=False) + def _save_metrics(self, batch, logs, force_save=False): # if force_save is True, doesn't check whether collection needs to be saved for steps if logs is None: @@ -375,7 +390,7 @@ def _save_metrics(self, batch, logs, force_save=False): self._initialize_writers(only_initialize_if_missing=True) logs["batch"] = batch for key in logs: - if key in ["loss", "val_loss", "outputs"]: + if key in ["loss", "val_loss", "outputs", "y", "y_pred"]: # outputs is saved differently through outputs collection continue self._add_metric(metric_name=key) @@ -392,6 +407,7 @@ def _save_tensors_post_step(self, batch, logs): # some tensors available as value from within hook are saved here # weights, metrics self._save_metrics(batch, logs) + self._save_model_outputs(logs) if is_tf_version_2x() and tf.executing_eagerly(): for tensor_ref in self.tensor_refs_to_save_this_step: @@ -479,6 +495,7 @@ def on_epoch_end(self, batch, logs=None): if self._is_not_supported(): return self._save_metrics(batch=batch, logs=logs, force_save=True) + self._save_model_outputs(logs=logs) self._close_writers() def _on_any_mode_begin(self, mode): diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index ae972126a..16e6e3c47 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -412,9 +412,14 @@ def test_keras_fit(out_dir, tf_eager_mode, saveall): # can't save gradients in TF 2.x eager mode if saveall: # save losses, metrics, weights, biases, scalar if tf_eager_mode: - assert len(trial.tensor_names()) == (13 if is_tf_2_2() else 14) + assert len(trial.tensor_names()) == (16 if is_tf_2_2() else 17) else: - assert len(trial.tensor_names()) == 21 + assert len(trial.tensor_names()) == 24 + assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 2 + assert ( + len(trial.tensor_names(collection=CollectionKeys.OUTPUTS, mode=ModeKeys.PREDICT)) == 0 + ) # bug: + assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS, mode=ModeKeys.TRAIN)) == 2 assert len(trial.tensor_names(collection=CollectionKeys.BIASES)) == 2 assert len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) == 2 assert len(trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES)) == 5 @@ -427,6 +432,10 @@ def test_keras_fit(out_dir, tf_eager_mode, saveall): == 0, "No Optimizer Variables Should be Saved in EVAL Mode", ) + for tname in trial.tensor_names(collection=CollectionKeys.OUTPUTS): + output = trial.tensor(tname) + assert tname in ["train_output/y_pred", "train_output/y", "predict_output"] + assert output.value(0) is not None else: # save the default losses and metrics assert len(trial.tensor_names()) == (4 if is_tf_2_2() and tf_eager_mode else 5) assert len(trial.tensor_names(collection=CollectionKeys.LOSSES)) == 1 From 11ddcdd362a3933a3adcfa35d41c8a5c3da3b869 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Tue, 2 Jun 2020 23:26:14 -0700 Subject: [PATCH 037/149] assert updates --- tests/tensorflow2/test_keras.py | 6 +++--- tests/tensorflow2/test_keras_mirrored.py | 6 ++++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 16e6e3c47..4339dd39f 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -412,7 +412,7 @@ def test_keras_fit(out_dir, tf_eager_mode, saveall): # can't save gradients in TF 2.x eager mode if saveall: # save losses, metrics, weights, biases, scalar if tf_eager_mode: - assert len(trial.tensor_names()) == (16 if is_tf_2_2() else 17) + assert len(trial.tensor_names()) == (15 if is_tf_2_2() else 16) else: assert len(trial.tensor_names()) == 24 assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 2 @@ -593,7 +593,7 @@ def test_include_collections(out_dir, tf_eager_mode): trial = smd.create_trial(path=out_dir) # can't save gradients in TF 2.x if tf_eager_mode: - assert len(trial.tensor_names()) == (12 if is_tf_2_2() else 13) + assert len(trial.tensor_names()) == (14 if is_tf_2_2() else 15) else: assert len(trial.tensor_names()) == 18 assert len(trial.tensor_names(collection=CollectionKeys.GRADIENTS)) == 4 @@ -654,7 +654,7 @@ def test_keras_fit_pure_eager(out_dir, tf_eager_mode): helper_keras_fit(trial_dir=out_dir, hook=hook, eager=tf_eager_mode, run_eagerly=True) trial = smd.create_trial(path=out_dir) - assert len(trial.tensor_names()) == (12 if is_tf_2_2() else 13) + assert len(trial.tensor_names()) == (14 if is_tf_2_2() else 15) assert len(trial.tensor_names(collection=CollectionKeys.BIASES)) == 2 assert len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) == 2 assert len(trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES)) == 5 diff --git a/tests/tensorflow2/test_keras_mirrored.py b/tests/tensorflow2/test_keras_mirrored.py index 3ff6f307a..d7630fb9f 100644 --- a/tests/tensorflow2/test_keras_mirrored.py +++ b/tests/tensorflow2/test_keras_mirrored.py @@ -256,8 +256,10 @@ def test_save_all(out_dir, tf_eager_mode, workers): tr = create_trial_fast_refresh(out_dir) print(tr.tensor_names()) if tf_eager_mode: - assert len(tr.tensor_names()) == (6 + 2 + 1 + 5 + 1 if is_tf_2_2() else 6 + 3 + 1 + 5 + 1) - # weights, metrics, losses, optimizer variables, scalar + assert len(tr.tensor_names()) == ( + 6 + 2 + 1 + 5 + 1 + 2 if is_tf_2_2() else 6 + 3 + 1 + 5 + 1 + 2 + ) + # weights, metrics, losses, optimizer variables, scalar, model outputs else: assert ( len(tr.tensor_names()) From 34d2294cdf5095086eb010ed87dd396ea35d6e74 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Tue, 2 Jun 2020 23:43:21 -0700 Subject: [PATCH 038/149] update assert --- tests/tensorflow2/test_keras_mirrored.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/tensorflow2/test_keras_mirrored.py b/tests/tensorflow2/test_keras_mirrored.py index d7630fb9f..f28deaadd 100644 --- a/tests/tensorflow2/test_keras_mirrored.py +++ b/tests/tensorflow2/test_keras_mirrored.py @@ -165,10 +165,10 @@ def exhaustive_check(trial_dir, include_workers="one", eager=True): assert len(tr.workers()) == strategy.num_replicas_in_sync if eager: assert len(tr.tensor_names()) == ( - 6 + 1 + 2 + 5 + 1 if is_tf_2_2() else 6 + 1 + 3 + 5 + 1 + 6 + 1 + 2 + 5 + 1 + 2 if is_tf_2_2() else 6 + 1 + 3 + 5 + 1 + 2 ) - # 6 weights, 1 loss, 3 metrics, 5 optimizer variables for Tf 2.1, 1 scalar - # 6 weights, 1 loss, 2 metrics, 5 optimizer variables for Tf 2.2, 1 scalar + # 6 weights, 1 loss, 3 metrics, 5 optimizer variables for Tf 2.1, 1 scalar, 2 model outputs + # 6 weights, 1 loss, 2 metrics, 5 optimizer variables for Tf 2.2, 1 scalar, 2 model outputs else: assert len(tr.tensor_names()) == (6 + 6 + 1 + 3 + strategy.num_replicas_in_sync * 3 + 5) else: From f87ce0144df77fcc8b91b302de4f46e49f2ee3e6 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Wed, 3 Jun 2020 00:20:05 -0700 Subject: [PATCH 039/149] cleanup --- smdebug/core/tfevent/event_file_reader.py | 1 - smdebug/tensorflow/collection.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/smdebug/core/tfevent/event_file_reader.py b/smdebug/core/tfevent/event_file_reader.py index 176fb9e38..c5f4b5fd9 100644 --- a/smdebug/core/tfevent/event_file_reader.py +++ b/smdebug/core/tfevent/event_file_reader.py @@ -38,7 +38,6 @@ def as_dtype(t): types_pb2.DT_INT64: np.int64, types_pb2.DT_STRING: np.str, types_pb2.DT_BOOL: np.bool, - types_pb2.DT_UINT8: np.uint8, } return _INTERN_TABLE[t] diff --git a/smdebug/tensorflow/collection.py b/smdebug/tensorflow/collection.py index 900716479..89df3916a 100644 --- a/smdebug/tensorflow/collection.py +++ b/smdebug/tensorflow/collection.py @@ -58,7 +58,7 @@ def add_distributed_variable(self, arg, export_name=None, mode=None): def add_aggregating_variable(self, arg, name=None, mode=None): return self.add_variable(arg.get(), name, mode=mode) - def add_tensor(self, arg, name=None, mode=None, type=None): + def add_tensor(self, arg, name=None, mode=None): # in keras we need to store the mode and only get tensors by mode return self._store_tensor_ref(TensorRef.from_tensor(arg, name, mode=mode)) From bbb0dc624af3786c41b2bc61e3138554523be8e6 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Wed, 3 Jun 2020 02:47:30 -0700 Subject: [PATCH 040/149] as_dtype: --- smdebug/core/tfevent/event_file_reader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/smdebug/core/tfevent/event_file_reader.py b/smdebug/core/tfevent/event_file_reader.py index c5f4b5fd9..176fb9e38 100644 --- a/smdebug/core/tfevent/event_file_reader.py +++ b/smdebug/core/tfevent/event_file_reader.py @@ -38,6 +38,7 @@ def as_dtype(t): types_pb2.DT_INT64: np.int64, types_pb2.DT_STRING: np.str, types_pb2.DT_BOOL: np.bool, + types_pb2.DT_UINT8: np.uint8, } return _INTERN_TABLE[t] From 82f0531eb8ab7dccc47dd5ec5fa708540b7f59b8 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Thu, 4 Jun 2020 01:13:32 -0700 Subject: [PATCH 041/149] model outputs are now constants --- smdebug/tensorflow/keras.py | 11 +++++++---- smdebug/tensorflow/utils.py | 5 +++++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index a0629cf5a..84fa8e613 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -15,6 +15,7 @@ from .collection import CollectionKeys from .tensor_ref import TensorRef, get_tf_names from .utils import ( + ModelOutput, TFDistributionStrategy, get_export_name_for_keras, get_keras_layer_inputs, @@ -371,11 +372,14 @@ def _save_model_outputs(self, logs): return if self._is_collection_being_saved_for_step(CollectionKeys.OUTPUTS): - export_names = {"y_pred": "train_output/y_pred", "y": "train_output/y"} + export_names = { + ModelOutput.Y_PRED: "train_output/y_pred", + ModelOutput.Y: "train_output/y", + } self._initialize_writers(only_initialize_if_missing=True) output_collection = self.collection_manager.get(CollectionKeys.OUTPUTS) for key in logs: - if key in ["y", "y_pred"]: + if key in [ModelOutput.Y, ModelOutput.Y_PRED]: tensor_ref = TensorRef.from_non_graph_var(export_names[key]) output_collection.set_tensor_ref(tensor_ref) self.tensor_to_collections[export_names[key]] = {output_collection} @@ -390,7 +394,7 @@ def _save_metrics(self, batch, logs, force_save=False): self._initialize_writers(only_initialize_if_missing=True) logs["batch"] = batch for key in logs: - if key in ["loss", "val_loss", "outputs", "y", "y_pred"]: + if key in ["loss", "val_loss", "outputs", ModelOutput.Y, ModelOutput.Y_PRED]: # outputs is saved differently through outputs collection continue self._add_metric(metric_name=key) @@ -495,7 +499,6 @@ def on_epoch_end(self, batch, logs=None): if self._is_not_supported(): return self._save_metrics(batch=batch, logs=logs, force_save=True) - self._save_model_outputs(logs=logs) self._close_writers() def _on_any_mode_begin(self, mode): diff --git a/smdebug/tensorflow/utils.py b/smdebug/tensorflow/utils.py index c58469023..5bd94329b 100644 --- a/smdebug/tensorflow/utils.py +++ b/smdebug/tensorflow/utils.py @@ -12,6 +12,11 @@ from smdebug.core.modes import ModeKeys +class ModelOutput: + Y = "y" + Y_PRED = "y_pred" + + class TFDistributionStrategy(Enum): NONE = 0 HOROVOD = 1 From 4663370a2ec248052431870fd7178936cfc6939d Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Thu, 4 Jun 2020 02:40:13 -0700 Subject: [PATCH 042/149] update to test --- tests/tensorflow2/test_keras.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 4339dd39f..38e238231 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -22,6 +22,7 @@ from smdebug.core.json_config import CONFIG_FILE_PATH_ENV_STR from smdebug.core.modes import ModeKeys from smdebug.core.reduction_config import ALLOWED_NORMS, ALLOWED_REDUCTIONS +from smdebug.core.utils import ModelOutput from smdebug.exceptions import TensorUnavailableForStep from smdebug.tensorflow import ReductionConfig, SaveConfig @@ -396,7 +397,8 @@ def test_gradtape_persistent(out_dir, saveall): @pytest.mark.slow @pytest.mark.parametrize("saveall", [True, False]) def test_keras_fit(out_dir, tf_eager_mode, saveall): - hook = smd.KerasHook(out_dir=out_dir, save_all=saveall) + save_config = SaveConfig(save_interval=1) if saveall else None + hook = smd.KerasHook(out_dir=out_dir, save_all=saveall, save_config=save_config) ts = time.time() hook.save_scalar("foobar", 1, sm_metric=True, timestamp=ts) scalars_to_be_saved = dict() @@ -411,6 +413,9 @@ def test_keras_fit(out_dir, tf_eager_mode, saveall): trial = smd.create_trial(path=out_dir) # can't save gradients in TF 2.x eager mode if saveall: # save losses, metrics, weights, biases, scalar + assert len(trial.steps(mode=ModeKeys.TRAIN)) == 20 + assert len(trial.steps(mode=ModeKeys.EVAL)) == 10 + assert len(trial.steps(mode=ModeKeys.PREDICT)) == 4 if tf_eager_mode: assert len(trial.tensor_names()) == (15 if is_tf_2_2() else 16) else: @@ -434,8 +439,14 @@ def test_keras_fit(out_dir, tf_eager_mode, saveall): ) for tname in trial.tensor_names(collection=CollectionKeys.OUTPUTS): output = trial.tensor(tname) - assert tname in ["train_output/y_pred", "train_output/y", "predict_output"] + assert tname in [ModelOutput.Y_PRED, ModelOutput.Y] assert output.value(0) is not None + assert output.steps() == trial.steps(mode=ModeKeys.TRAIN) + # Check the shape of output tensors + assert trial.tensor(ModelOutput.Y).value(0).shape[1] == 1 # label + assert ( + trial.tensor(ModelOutput.Y_PRED).value(0).shape[1] == 10 + ) # Output probability for each class else: # save the default losses and metrics assert len(trial.tensor_names()) == (4 if is_tf_2_2() and tf_eager_mode else 5) assert len(trial.tensor_names(collection=CollectionKeys.LOSSES)) == 1 From c64a7a135b36b45a633869330dbbabc7892d9d95 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Thu, 4 Jun 2020 02:41:37 -0700 Subject: [PATCH 043/149] update import statement --- tests/tensorflow2/test_keras.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 38e238231..4227046d8 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -22,9 +22,9 @@ from smdebug.core.json_config import CONFIG_FILE_PATH_ENV_STR from smdebug.core.modes import ModeKeys from smdebug.core.reduction_config import ALLOWED_NORMS, ALLOWED_REDUCTIONS -from smdebug.core.utils import ModelOutput from smdebug.exceptions import TensorUnavailableForStep from smdebug.tensorflow import ReductionConfig, SaveConfig +from smdebug.tensorflow.utils import ModelOutput def helper_keras_fit( From 15c1d61e107ef9e69fb6371969fafc7da2807c70 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Thu, 4 Jun 2020 22:45:52 -0700 Subject: [PATCH 044/149] tmp --- smdebug/tensorflow/keras.py | 44 ++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 84fa8e613..ee1b97cd3 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -352,38 +352,36 @@ def _save_inputs(self, check_before_write=True): # TODO pass - def _add_metric(self, metric_name, metric_value: tf.Tensor = None): - if metric_name in self.tensor_to_collections: + def _save_tensor(self, t_name: str, t_value: tf.Tensor, collection: CollectionKeys): + if t_name in self.tensor_to_collections: return + coll = self.collection_manager.get(collection) + if isinstance(t_value, tf.Tensor): + coll.set_tensor_ref(TensorRef.from_non_graph_var(t_name)) + else: + coll.set_tensor_ref(TensorRef.from_non_graph_var(t_name)) + self.tensor_to_collections[t_name] = {coll} + self._initialize_writers(only_initialize_if_missing=True) + self._save_for_tensor(t_name, t_value, check_before_write=False) + def _add_metric(self, metric_name, metric_value: tf.Tensor = None): if metric_name in ["loss", "val_loss"]: coll_name = CollectionKeys.LOSSES else: coll_name = CollectionKeys.METRICS - coll = self.collection_manager.get(coll_name) - if metric_value: - coll.set_tensor_ref(metric_value, metric_name) - else: - coll.set_tensor_ref(TensorRef.from_non_graph_var(metric_name)) - self.tensor_to_collections[metric_name] = {coll} + self._save_tensor(metric_name, metric_value, coll_name) def _save_model_outputs(self, logs): if logs is None: return - if self._is_collection_being_saved_for_step(CollectionKeys.OUTPUTS): export_names = { ModelOutput.Y_PRED: "train_output/y_pred", ModelOutput.Y: "train_output/y", } - self._initialize_writers(only_initialize_if_missing=True) - output_collection = self.collection_manager.get(CollectionKeys.OUTPUTS) for key in logs: if key in [ModelOutput.Y, ModelOutput.Y_PRED]: - tensor_ref = TensorRef.from_non_graph_var(export_names[key]) - output_collection.set_tensor_ref(tensor_ref) - self.tensor_to_collections[export_names[key]] = {output_collection} - self._save_for_tensor(export_names[key], logs[key], check_before_write=False) + self._save_tensor(export_names[key], logs[key], CollectionKeys.OUTPUTS) def _save_metrics(self, batch, logs, force_save=False): # if force_save is True, doesn't check whether collection needs to be saved for steps @@ -391,27 +389,33 @@ def _save_metrics(self, batch, logs, force_save=False): return if force_save or self._is_collection_being_saved_for_step(CollectionKeys.METRICS): - self._initialize_writers(only_initialize_if_missing=True) logs["batch"] = batch for key in logs: if key in ["loss", "val_loss", "outputs", ModelOutput.Y, ModelOutput.Y_PRED]: # outputs is saved differently through outputs collection continue - self._add_metric(metric_name=key) - self._save_for_tensor(key, logs[key], check_before_write=False) + self._add_metric(metric_name=key, metric_value=logs[key]) if force_save or self._is_collection_being_saved_for_step(CollectionKeys.LOSSES): self._initialize_writers(only_initialize_if_missing=True) for key in ["loss", "val_loss"]: if key in logs: - self._add_metric(metric_name=key) - self._save_for_tensor(key, logs[key], check_before_write=False) + self._add_metric(metric_name=key, metric_value=logs[key]) + + def _save_gradients(self, logs): + if logs is None: + return + + if self._is_collection_being_saved_for_step(CollectionKeys.METRICS): + if "gradients" in logs: + tensor_ref = TensorRef.from_non_graph_var(export_names[key]) def _save_tensors_post_step(self, batch, logs): # some tensors available as value from within hook are saved here # weights, metrics self._save_metrics(batch, logs) self._save_model_outputs(logs) + self._save_gradients(logs) if is_tf_version_2x() and tf.executing_eagerly(): for tensor_ref in self.tensor_refs_to_save_this_step: From be6186fbeb21de1bbf485ff10605b0c29e7595e2 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Sun, 7 Jun 2020 23:07:32 -0700 Subject: [PATCH 045/149] Revert "tmp" This reverts commit 5fd3a7445fa555c1525787e38b7cdde00d88cdd4. --- smdebug/tensorflow/keras.py | 44 +++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index ee1b97cd3..84fa8e613 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -352,36 +352,38 @@ def _save_inputs(self, check_before_write=True): # TODO pass - def _save_tensor(self, t_name: str, t_value: tf.Tensor, collection: CollectionKeys): - if t_name in self.tensor_to_collections: + def _add_metric(self, metric_name, metric_value: tf.Tensor = None): + if metric_name in self.tensor_to_collections: return - coll = self.collection_manager.get(collection) - if isinstance(t_value, tf.Tensor): - coll.set_tensor_ref(TensorRef.from_non_graph_var(t_name)) - else: - coll.set_tensor_ref(TensorRef.from_non_graph_var(t_name)) - self.tensor_to_collections[t_name] = {coll} - self._initialize_writers(only_initialize_if_missing=True) - self._save_for_tensor(t_name, t_value, check_before_write=False) - def _add_metric(self, metric_name, metric_value: tf.Tensor = None): if metric_name in ["loss", "val_loss"]: coll_name = CollectionKeys.LOSSES else: coll_name = CollectionKeys.METRICS - self._save_tensor(metric_name, metric_value, coll_name) + coll = self.collection_manager.get(coll_name) + if metric_value: + coll.set_tensor_ref(metric_value, metric_name) + else: + coll.set_tensor_ref(TensorRef.from_non_graph_var(metric_name)) + self.tensor_to_collections[metric_name] = {coll} def _save_model_outputs(self, logs): if logs is None: return + if self._is_collection_being_saved_for_step(CollectionKeys.OUTPUTS): export_names = { ModelOutput.Y_PRED: "train_output/y_pred", ModelOutput.Y: "train_output/y", } + self._initialize_writers(only_initialize_if_missing=True) + output_collection = self.collection_manager.get(CollectionKeys.OUTPUTS) for key in logs: if key in [ModelOutput.Y, ModelOutput.Y_PRED]: - self._save_tensor(export_names[key], logs[key], CollectionKeys.OUTPUTS) + tensor_ref = TensorRef.from_non_graph_var(export_names[key]) + output_collection.set_tensor_ref(tensor_ref) + self.tensor_to_collections[export_names[key]] = {output_collection} + self._save_for_tensor(export_names[key], logs[key], check_before_write=False) def _save_metrics(self, batch, logs, force_save=False): # if force_save is True, doesn't check whether collection needs to be saved for steps @@ -389,33 +391,27 @@ def _save_metrics(self, batch, logs, force_save=False): return if force_save or self._is_collection_being_saved_for_step(CollectionKeys.METRICS): + self._initialize_writers(only_initialize_if_missing=True) logs["batch"] = batch for key in logs: if key in ["loss", "val_loss", "outputs", ModelOutput.Y, ModelOutput.Y_PRED]: # outputs is saved differently through outputs collection continue - self._add_metric(metric_name=key, metric_value=logs[key]) + self._add_metric(metric_name=key) + self._save_for_tensor(key, logs[key], check_before_write=False) if force_save or self._is_collection_being_saved_for_step(CollectionKeys.LOSSES): self._initialize_writers(only_initialize_if_missing=True) for key in ["loss", "val_loss"]: if key in logs: - self._add_metric(metric_name=key, metric_value=logs[key]) - - def _save_gradients(self, logs): - if logs is None: - return - - if self._is_collection_being_saved_for_step(CollectionKeys.METRICS): - if "gradients" in logs: - tensor_ref = TensorRef.from_non_graph_var(export_names[key]) + self._add_metric(metric_name=key) + self._save_for_tensor(key, logs[key], check_before_write=False) def _save_tensors_post_step(self, batch, logs): # some tensors available as value from within hook are saved here # weights, metrics self._save_metrics(batch, logs) self._save_model_outputs(logs) - self._save_gradients(logs) if is_tf_version_2x() and tf.executing_eagerly(): for tensor_ref in self.tensor_refs_to_save_this_step: From ae8f96bc49fb0d3095920bbcad10f431fc0181d4 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Sun, 7 Jun 2020 23:08:13 -0700 Subject: [PATCH 046/149] str_to_mode --- smdebug/core/modes.py | 13 +++++++++++++ smdebug/core/save_config.py | 4 +++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/smdebug/core/modes.py b/smdebug/core/modes.py index fce21ad1d..809df1ce9 100644 --- a/smdebug/core/modes.py +++ b/smdebug/core/modes.py @@ -14,3 +14,16 @@ class ModeKeys(Enum): ALLOWED_MODE_NAMES = [x.name for x in ALLOWED_MODES] MODE_STEP_PLUGIN_NAME = "mode_step" MODE_PLUGIN_NAME = "mode" + + +def str_to_mode_keys(s): + if s == "train": + return ModeKeys.TRAIN + elif s == "eval": + return ModeKeys.EVAL + elif s == "predict": + return ModeKeys.PREDICT + elif s == "global": + return ModeKeys.GLOBAL + else: + raise Exception("Invalid mode") diff --git a/smdebug/core/save_config.py b/smdebug/core/save_config.py index 5955c3d26..e4758df94 100644 --- a/smdebug/core/save_config.py +++ b/smdebug/core/save_config.py @@ -3,7 +3,7 @@ from typing import Any, Dict, List, Union # First Party -from smdebug.core.modes import ModeKeys +from smdebug.core.modes import ModeKeys, str_to_mode_keys from smdebug.core.utils import step_in_range DEFAULT_SAVE_CONFIG_INTERVAL = 500 @@ -83,6 +83,8 @@ def set_save_config(self, mode: ModeKeys, save_config_mode: "SaveConfigMode") -> self.mode_save_configs[mode] = save_config_mode def should_save_step(self, mode, step_num) -> bool: + if isinstance(mode, str): + mode = str_to_mode_keys(mode) return self.get_save_config(mode).should_save_step(step_num) def to_json_dict(self) -> Dict: From 30bd425178f9e147bba9f45f671f40ed2a7a33bb Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Mon, 8 Jun 2020 00:25:12 -0700 Subject: [PATCH 047/149] add tensor --- smdebug/tensorflow/keras.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 84fa8e613..da2b2997d 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -380,8 +380,12 @@ def _save_model_outputs(self, logs): output_collection = self.collection_manager.get(CollectionKeys.OUTPUTS) for key in logs: if key in [ModelOutput.Y, ModelOutput.Y_PRED]: - tensor_ref = TensorRef.from_non_graph_var(export_names[key]) - output_collection.set_tensor_ref(tensor_ref) + tensor_value = logs[key] + if isinstance(tensor_value, values.PerReplica): + output_collection.add(tensor_value) + else: + tensor_ref = TensorRef.from_non_graph_var(export_names[key]) + output_collection.set_tensor_ref(tensor_ref) self.tensor_to_collections[export_names[key]] = {output_collection} self._save_for_tensor(export_names[key], logs[key], check_before_write=False) From 1e7aa1b1a05e751f1d603929c252d745fd7b727f Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Mon, 8 Jun 2020 00:29:44 -0700 Subject: [PATCH 048/149] add tensor --- smdebug/tensorflow/keras.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index da2b2997d..4a1e78903 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -382,7 +382,9 @@ def _save_model_outputs(self, logs): if key in [ModelOutput.Y, ModelOutput.Y_PRED]: tensor_value = logs[key] if isinstance(tensor_value, values.PerReplica): - output_collection.add(tensor_value) + output_collection.add_distributed_variable( + tensor_value, export_names=export_names[key] + ) else: tensor_ref = TensorRef.from_non_graph_var(export_names[key]) output_collection.set_tensor_ref(tensor_ref) From 85ea95a050b65462746ce5e7830908870d588da4 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Mon, 8 Jun 2020 01:18:30 -0700 Subject: [PATCH 049/149] add dist tensor: --- smdebug/tensorflow/keras.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 4a1e78903..15556ff3d 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -381,12 +381,15 @@ def _save_model_outputs(self, logs): for key in logs: if key in [ModelOutput.Y, ModelOutput.Y_PRED]: tensor_value = logs[key] + tensor_refs = [] if isinstance(tensor_value, values.PerReplica): - output_collection.add_distributed_variable( - tensor_value, export_names=export_names[key] - ) + for t in tensor_value: + tensor_ref = TensorRef.from_non_graph_var(export_names[key]) + tensor_refs.append(tensor_ref) else: tensor_ref = TensorRef.from_non_graph_var(export_names[key]) + tensor_refs.append(tensor_ref) + for tensor_ref in tensor_refs: output_collection.set_tensor_ref(tensor_ref) self.tensor_to_collections[export_names[key]] = {output_collection} self._save_for_tensor(export_names[key], logs[key], check_before_write=False) From 95b8bccce0261c7cd1e47b16bb111cd1a52413d2 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Mon, 8 Jun 2020 01:27:09 -0700 Subject: [PATCH 050/149] add tensor --- smdebug/tensorflow/keras.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 15556ff3d..1e8b82f09 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -383,16 +383,19 @@ def _save_model_outputs(self, logs): tensor_value = logs[key] tensor_refs = [] if isinstance(tensor_value, values.PerReplica): - for t in tensor_value: + for t in tensor_value._values: tensor_ref = TensorRef.from_non_graph_var(export_names[key]) tensor_refs.append(tensor_ref) + self._save_for_tensor(export_names[key], t, check_before_write=False) else: tensor_ref = TensorRef.from_non_graph_var(export_names[key]) tensor_refs.append(tensor_ref) + self._save_for_tensor( + export_names[key], logs[key], check_before_write=False + ) for tensor_ref in tensor_refs: output_collection.set_tensor_ref(tensor_ref) self.tensor_to_collections[export_names[key]] = {output_collection} - self._save_for_tensor(export_names[key], logs[key], check_before_write=False) def _save_metrics(self, batch, logs, force_save=False): # if force_save is True, doesn't check whether collection needs to be saved for steps From 07fd3990cb0edddecd13a33d67db288028432ccd Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Mon, 8 Jun 2020 01:30:32 -0700 Subject: [PATCH 051/149] for-loop --- smdebug/tensorflow/keras.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 1e8b82f09..729048be6 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -385,16 +385,14 @@ def _save_model_outputs(self, logs): if isinstance(tensor_value, values.PerReplica): for t in tensor_value._values: tensor_ref = TensorRef.from_non_graph_var(export_names[key]) - tensor_refs.append(tensor_ref) - self._save_for_tensor(export_names[key], t, check_before_write=False) + tensor_refs.append(tensor_ref, t) + else: tensor_ref = TensorRef.from_non_graph_var(export_names[key]) - tensor_refs.append(tensor_ref) - self._save_for_tensor( - export_names[key], logs[key], check_before_write=False - ) - for tensor_ref in tensor_refs: + tensor_refs.append(tensor_ref, logs[key]) + for tensor_ref, t in tensor_refs: output_collection.set_tensor_ref(tensor_ref) + self._save_for_tensor(export_names[key], t, check_before_write=False) self.tensor_to_collections[export_names[key]] = {output_collection} def _save_metrics(self, batch, logs, force_save=False): From 7151978787166527d768c68ef5597b8765add8f1 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Mon, 8 Jun 2020 01:41:22 -0700 Subject: [PATCH 052/149] fix append --- smdebug/tensorflow/keras.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 729048be6..dcfb029a3 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -385,11 +385,11 @@ def _save_model_outputs(self, logs): if isinstance(tensor_value, values.PerReplica): for t in tensor_value._values: tensor_ref = TensorRef.from_non_graph_var(export_names[key]) - tensor_refs.append(tensor_ref, t) + tensor_refs.append((tensor_ref, t)) else: tensor_ref = TensorRef.from_non_graph_var(export_names[key]) - tensor_refs.append(tensor_ref, logs[key]) + tensor_refs.append((tensor_ref, logs[key])) for tensor_ref, t in tensor_refs: output_collection.set_tensor_ref(tensor_ref) self._save_for_tensor(export_names[key], t, check_before_write=False) From 72a72561b4bd1fd2f02c5989b4fd010443ed111f Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Mon, 8 Jun 2020 01:51:46 -0700 Subject: [PATCH 053/149] fix assert --- tests/tensorflow2/test_keras.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 4227046d8..be5ce5df5 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -439,7 +439,7 @@ def test_keras_fit(out_dir, tf_eager_mode, saveall): ) for tname in trial.tensor_names(collection=CollectionKeys.OUTPUTS): output = trial.tensor(tname) - assert tname in [ModelOutput.Y_PRED, ModelOutput.Y] + assert tname in ["train_output/y", "train_output/y_pred"] assert output.value(0) is not None assert output.steps() == trial.steps(mode=ModeKeys.TRAIN) # Check the shape of output tensors From 046d165530335d69010709c42b2a30ad070c5bc5 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Mon, 8 Jun 2020 01:59:49 -0700 Subject: [PATCH 054/149] add --- tests/tensorflow2/test_keras.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index be5ce5df5..368f7f6ed 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -441,7 +441,7 @@ def test_keras_fit(out_dir, tf_eager_mode, saveall): output = trial.tensor(tname) assert tname in ["train_output/y", "train_output/y_pred"] assert output.value(0) is not None - assert output.steps() == trial.steps(mode=ModeKeys.TRAIN) + assert output.steps(mode=ModeKeys.TRAIN) == trial.steps(mode=ModeKeys.TRAIN) # Check the shape of output tensors assert trial.tensor(ModelOutput.Y).value(0).shape[1] == 1 # label assert ( From 070cd6f2824007750d07cdb92a006f3dc082b539 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Mon, 8 Jun 2020 02:02:54 -0700 Subject: [PATCH 055/149] model output --- tests/tensorflow2/test_keras.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 368f7f6ed..543829f6a 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -24,7 +24,6 @@ from smdebug.core.reduction_config import ALLOWED_NORMS, ALLOWED_REDUCTIONS from smdebug.exceptions import TensorUnavailableForStep from smdebug.tensorflow import ReductionConfig, SaveConfig -from smdebug.tensorflow.utils import ModelOutput def helper_keras_fit( @@ -443,9 +442,9 @@ def test_keras_fit(out_dir, tf_eager_mode, saveall): assert output.value(0) is not None assert output.steps(mode=ModeKeys.TRAIN) == trial.steps(mode=ModeKeys.TRAIN) # Check the shape of output tensors - assert trial.tensor(ModelOutput.Y).value(0).shape[1] == 1 # label + assert trial.tensor("train_output/y").value(0).shape[1] == 1 # label assert ( - trial.tensor(ModelOutput.Y_PRED).value(0).shape[1] == 10 + trial.tensor("train_output/y_pred").value(0).shape[1] == 10 ) # Output probability for each class else: # save the default losses and metrics assert len(trial.tensor_names()) == (4 if is_tf_2_2() and tf_eager_mode else 5) From 8af4ce8a1d7d7abfec5ec17caa0896c0e6279de4 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Mon, 8 Jun 2020 02:36:27 -0700 Subject: [PATCH 056/149] rename --- smdebug/tensorflow/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/smdebug/tensorflow/utils.py b/smdebug/tensorflow/utils.py index 5bd94329b..3c8e8176e 100644 --- a/smdebug/tensorflow/utils.py +++ b/smdebug/tensorflow/utils.py @@ -13,8 +13,8 @@ class ModelOutput: - Y = "y" - Y_PRED = "y_pred" + Y = "smdebug_y" + Y_PRED = "smdebug_y_pred" class TFDistributionStrategy(Enum): From 1761ca2253b8a6346582f82f8c3d4962fc545209 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Tue, 9 Jun 2020 02:51:31 -0700 Subject: [PATCH 057/149] add to all collections --- smdebug/tensorflow/keras.py | 43 +++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index dcfb029a3..141c4aa55 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -371,29 +371,34 @@ def _save_model_outputs(self, logs): if logs is None: return - if self._is_collection_being_saved_for_step(CollectionKeys.OUTPUTS): - export_names = { - ModelOutput.Y_PRED: "train_output/y_pred", - ModelOutput.Y: "train_output/y", - } - self._initialize_writers(only_initialize_if_missing=True) - output_collection = self.collection_manager.get(CollectionKeys.OUTPUTS) - for key in logs: - if key in [ModelOutput.Y, ModelOutput.Y_PRED]: - tensor_value = logs[key] - tensor_refs = [] - if isinstance(tensor_value, values.PerReplica): - for t in tensor_value._values: - tensor_ref = TensorRef.from_non_graph_var(export_names[key]) - tensor_refs.append((tensor_ref, t)) + export_names = {ModelOutput.Y_PRED: "train_output/y_pred", ModelOutput.Y: "train_output/y"} - else: + for key in logs: + if key in [ModelOutput.Y, ModelOutput.Y_PRED]: + collections_to_save = self._get_collections_to_save_for_step() + collections_to_save = self._get_collections_with_tensor( + export_names[key] + ).intersection(collections_to_save) + + self._initialize_writers(only_initialize_if_missing=True) + tensor_value = logs[key] + tensor_refs = [] + if isinstance(tensor_value, values.PerReplica): + for t in tensor_value._values: tensor_ref = TensorRef.from_non_graph_var(export_names[key]) - tensor_refs.append((tensor_ref, logs[key])) + tensor_refs.append((tensor_ref, t)) + else: + tensor_ref = TensorRef.from_non_graph_var(export_names[key]) + tensor_refs.append((tensor_ref, logs[key])) + + for collection in collections_to_save: for tensor_ref, t in tensor_refs: - output_collection.set_tensor_ref(tensor_ref) + collection.set_tensor_ref(tensor_ref) self._save_for_tensor(export_names[key], t, check_before_write=False) - self.tensor_to_collections[export_names[key]] = {output_collection} + if export_names[key] not in self.tensor_to_collections: + self.tensor_to_collections[export_names[key]] = {collection} + else: + self.tensor_to_collections[export_names[key]].add(collection) def _save_metrics(self, batch, logs, force_save=False): # if force_save is True, doesn't check whether collection needs to be saved for steps From 6b581bff0270215600c04ca9e59eff0f1ab8acc6 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Tue, 9 Jun 2020 03:18:23 -0700 Subject: [PATCH 058/149] revert --- smdebug/tensorflow/keras.py | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 141c4aa55..ad6a6e121 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -320,6 +320,7 @@ def _prepare_non_layer_tensors(self): for coll in [ self.get_collection(name=CollectionKeys.OPTIMIZER_VARIABLES), self.get_collection(name=CollectionKeys.GRADIENTS), + self.get_collection(name=CollectionKeys.OUTPUTS), ]: for tensor_ref in coll.get_tensors(): if tensor_ref.name not in self.tensor_to_collections: @@ -376,29 +377,24 @@ def _save_model_outputs(self, logs): for key in logs: if key in [ModelOutput.Y, ModelOutput.Y_PRED]: collections_to_save = self._get_collections_to_save_for_step() - collections_to_save = self._get_collections_with_tensor( - export_names[key] - ).intersection(collections_to_save) - - self._initialize_writers(only_initialize_if_missing=True) - tensor_value = logs[key] - tensor_refs = [] - if isinstance(tensor_value, values.PerReplica): - for t in tensor_value._values: + output_collection = self.get_collection(CollectionKeys.OUTPUTS) + if output_collection in collections_to_save: + self._initialize_writers(only_initialize_if_missing=True) + tensor_value = logs[key] + tensor_refs = [] + if isinstance(tensor_value, values.PerReplica): + for t in tensor_value._values: + tensor_ref = TensorRef.from_non_graph_var(export_names[key]) + tensor_refs.append((tensor_ref, t)) + else: tensor_ref = TensorRef.from_non_graph_var(export_names[key]) - tensor_refs.append((tensor_ref, t)) - else: - tensor_ref = TensorRef.from_non_graph_var(export_names[key]) - tensor_refs.append((tensor_ref, logs[key])) + tensor_refs.append((tensor_ref, logs[key])) - for collection in collections_to_save: for tensor_ref, t in tensor_refs: - collection.set_tensor_ref(tensor_ref) + output_collection.set_tensor_ref(tensor_ref) self._save_for_tensor(export_names[key], t, check_before_write=False) if export_names[key] not in self.tensor_to_collections: - self.tensor_to_collections[export_names[key]] = {collection} - else: - self.tensor_to_collections[export_names[key]].add(collection) + self.tensor_to_collections[export_names[key]] = {output_collection} def _save_metrics(self, batch, logs, force_save=False): # if force_save is True, doesn't check whether collection needs to be saved for steps @@ -619,14 +615,13 @@ def _on_any_batch_end(self, batch, mode, logs=None): if not is_tf_version_2x() or (is_tf_version_2x() and not tf.executing_eagerly()): self._remove_fetches_and_callbacks(mode) + self._save_tensors_post_step(batch, logs) if is_tf_version_2x() and tf.executing_eagerly(): # Need to prepare non layer tensors again since # some tensors only become available on batch end self._prepare_non_layer_tensors() self._write_optimizer_variables() - self._save_tensors_post_step(batch, logs) - if self._prepared_tensors[mode]: if self._exported_collections is False: # in keras, these collections change when mode changes From 6b14ee711c65461b2cc6903eb6f99020580c1be5 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Tue, 9 Jun 2020 03:50:16 -0700 Subject: [PATCH 059/149] add to all --- smdebug/tensorflow/keras.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index ad6a6e121..feef494e9 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -379,6 +379,10 @@ def _save_model_outputs(self, logs): collections_to_save = self._get_collections_to_save_for_step() output_collection = self.get_collection(CollectionKeys.OUTPUTS) if output_collection in collections_to_save: + collections_to_write = {output_collection} + for collection in collections_to_save: + if match_inc(export_names[key], collection.include_regex): + collections_to_write.add(collection) self._initialize_writers(only_initialize_if_missing=True) tensor_value = logs[key] tensor_refs = [] @@ -391,10 +395,9 @@ def _save_model_outputs(self, logs): tensor_refs.append((tensor_ref, logs[key])) for tensor_ref, t in tensor_refs: - output_collection.set_tensor_ref(tensor_ref) + for collection in collections_to_write: + collection.set_tensor_ref(tensor_ref) self._save_for_tensor(export_names[key], t, check_before_write=False) - if export_names[key] not in self.tensor_to_collections: - self.tensor_to_collections[export_names[key]] = {output_collection} def _save_metrics(self, batch, logs, force_save=False): # if force_save is True, doesn't check whether collection needs to be saved for steps From 5c89dff073a97b4ef2040eeb49c9e9d29fe08751 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Tue, 9 Jun 2020 04:47:34 -0700 Subject: [PATCH 060/149] helper fn --- smdebug/core/save_config.py | 4 +--- smdebug/tensorflow/keras.py | 7 ++++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/smdebug/core/save_config.py b/smdebug/core/save_config.py index e4758df94..5955c3d26 100644 --- a/smdebug/core/save_config.py +++ b/smdebug/core/save_config.py @@ -3,7 +3,7 @@ from typing import Any, Dict, List, Union # First Party -from smdebug.core.modes import ModeKeys, str_to_mode_keys +from smdebug.core.modes import ModeKeys from smdebug.core.utils import step_in_range DEFAULT_SAVE_CONFIG_INTERVAL = 500 @@ -83,8 +83,6 @@ def set_save_config(self, mode: ModeKeys, save_config_mode: "SaveConfigMode") -> self.mode_save_configs[mode] = save_config_mode def should_save_step(self, mode, step_num) -> bool: - if isinstance(mode, str): - mode = str_to_mode_keys(mode) return self.get_save_config(mode).should_save_step(step_num) def to_json_dict(self) -> Dict: diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index feef494e9..292a86ea3 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -6,7 +6,7 @@ from tensorflow.python.distribute import values # First Party -from smdebug.core.modes import ModeKeys +from smdebug.core.modes import ModeKeys, str_to_mode_keys from smdebug.core.utils import match_inc from smdebug.tensorflow.callable_cache import CallableCache @@ -96,6 +96,11 @@ def _is_not_supported(self): self._hook_supported = False return not self._hook_supported + def should_save_global_step_for_mode(self, mode: str): + mode = str_to_mode_keys(mode) + mode_step = self.mode_steps[mode] + return self.save_config.should_save_step(mode, mode_step) + def _get_matching_collections( self, mode, tensor, tensor_type, ts_name, is_input_to_model=False, is_output_of_model=False ): From cc135667c6116e68eb90bbac263a84151149e0ca Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Tue, 9 Jun 2020 05:05:52 -0700 Subject: [PATCH 061/149] helper fn --- smdebug/tensorflow/keras.py | 10 ++++++++-- smdebug/tensorflow/utils.py | 5 +++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 292a86ea3..26cecbfee 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -16,6 +16,7 @@ from .tensor_ref import TensorRef, get_tf_names from .utils import ( ModelOutput, + ModelOutputs, TFDistributionStrategy, get_export_name_for_keras, get_keras_layer_inputs, @@ -377,7 +378,12 @@ def _save_model_outputs(self, logs): if logs is None: return - export_names = {ModelOutput.Y_PRED: "train_output/y_pred", ModelOutput.Y: "train_output/y"} + export_names = { + ModelOutput.Y_PRED: "y_pred", + ModelOutput.Y: "y", + ModelOutput.VAL_Y: "val_y", + ModelOutput.VAL_Y_PRED: "val_y_pred", + } for key in logs: if key in [ModelOutput.Y, ModelOutput.Y_PRED]: @@ -413,7 +419,7 @@ def _save_metrics(self, batch, logs, force_save=False): self._initialize_writers(only_initialize_if_missing=True) logs["batch"] = batch for key in logs: - if key in ["loss", "val_loss", "outputs", ModelOutput.Y, ModelOutput.Y_PRED]: + if key in ["loss", "val_loss", "outputs"].extend(ModelOutputs): # outputs is saved differently through outputs collection continue self._add_metric(metric_name=key) diff --git a/smdebug/tensorflow/utils.py b/smdebug/tensorflow/utils.py index 3c8e8176e..43e41356f 100644 --- a/smdebug/tensorflow/utils.py +++ b/smdebug/tensorflow/utils.py @@ -15,6 +15,11 @@ class ModelOutput: Y = "smdebug_y" Y_PRED = "smdebug_y_pred" + VAL_Y = "val_smdebug_y" + VAL_Y_PRED = "val_smdebug_y" + + +ModelOutputs = [ModelOutput.Y, ModelOutput.VAL_Y, ModelOutput.VAL_Y_PRED] class TFDistributionStrategy(Enum): From d07dd47aaf0dd82d740fb75efbd91d4128515f32 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Tue, 9 Jun 2020 05:15:18 -0700 Subject: [PATCH 062/149] extend returns none --- smdebug/tensorflow/keras.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 26cecbfee..4b4bc7845 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -419,7 +419,7 @@ def _save_metrics(self, batch, logs, force_save=False): self._initialize_writers(only_initialize_if_missing=True) logs["batch"] = batch for key in logs: - if key in ["loss", "val_loss", "outputs"].extend(ModelOutputs): + if key in ["loss", "val_loss", "outputs"] + ModelOutputs: # outputs is saved differently through outputs collection continue self._add_metric(metric_name=key) From 766902aed8f578786609a9e59be8b05bee1eb71a Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Tue, 9 Jun 2020 05:19:12 -0700 Subject: [PATCH 063/149] ypred --- smdebug/tensorflow/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smdebug/tensorflow/utils.py b/smdebug/tensorflow/utils.py index 43e41356f..9c6e2d429 100644 --- a/smdebug/tensorflow/utils.py +++ b/smdebug/tensorflow/utils.py @@ -19,7 +19,7 @@ class ModelOutput: VAL_Y_PRED = "val_smdebug_y" -ModelOutputs = [ModelOutput.Y, ModelOutput.VAL_Y, ModelOutput.VAL_Y_PRED] +ModelOutputs = [ModelOutput.Y, ModelOutput.Y_PRED, ModelOutput.VAL_Y, ModelOutput.VAL_Y_PRED] class TFDistributionStrategy(Enum): From 4e1b8027a137eef76e6276cb0d8d09932f42b3e1 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Tue, 9 Jun 2020 05:20:04 -0700 Subject: [PATCH 064/149] ypred --- smdebug/tensorflow/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smdebug/tensorflow/utils.py b/smdebug/tensorflow/utils.py index 9c6e2d429..6eed8ee44 100644 --- a/smdebug/tensorflow/utils.py +++ b/smdebug/tensorflow/utils.py @@ -16,7 +16,7 @@ class ModelOutput: Y = "smdebug_y" Y_PRED = "smdebug_y_pred" VAL_Y = "val_smdebug_y" - VAL_Y_PRED = "val_smdebug_y" + VAL_Y_PRED = "val_smdebug_y_pred" ModelOutputs = [ModelOutput.Y, ModelOutput.Y_PRED, ModelOutput.VAL_Y, ModelOutput.VAL_Y_PRED] From 578284686f092b8924f996c6e97b6e2ffa1f725d Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Tue, 9 Jun 2020 05:33:06 -0700 Subject: [PATCH 065/149] change assert --- tests/tensorflow2/test_keras.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 543829f6a..ab4c20a69 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -438,14 +438,12 @@ def test_keras_fit(out_dir, tf_eager_mode, saveall): ) for tname in trial.tensor_names(collection=CollectionKeys.OUTPUTS): output = trial.tensor(tname) - assert tname in ["train_output/y", "train_output/y_pred"] + assert tname in ["y", "y_pred"] assert output.value(0) is not None assert output.steps(mode=ModeKeys.TRAIN) == trial.steps(mode=ModeKeys.TRAIN) # Check the shape of output tensors - assert trial.tensor("train_output/y").value(0).shape[1] == 1 # label - assert ( - trial.tensor("train_output/y_pred").value(0).shape[1] == 10 - ) # Output probability for each class + assert trial.tensor("y").value(0).shape[1] == 1 # label + assert trial.tensor("y_pred").value(0).shape[1] == 10 # Output probability for each class else: # save the default losses and metrics assert len(trial.tensor_names()) == (4 if is_tf_2_2() and tf_eager_mode else 5) assert len(trial.tensor_names(collection=CollectionKeys.LOSSES)) == 1 From 07c6e75024b1823c2f760e9ad652ca73021ece44 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Wed, 10 Jun 2020 09:53:12 -0700 Subject: [PATCH 066/149] init --- smdebug/tensorflow/keras.py | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 4b4bc7845..2b73bd18c 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -355,9 +355,31 @@ def _prepare_tensors_for_step(self, mode): non_input_tensors = set(coll.get_tensors(mode=mode)).difference(input_tensors_set) self.tensor_refs_to_save_this_step.update(non_input_tensors) - def _save_inputs(self, check_before_write=True): - # TODO - pass + def _save_inputs(self, logs): + for key in logs: + if key in ["smdebug_input"]: + collections_to_save = self._get_collections_to_save_for_step() + input_collection = self.get_collection(CollectionKeys.INPUTS) + if input_collection in collections_to_save: + collections_to_write = {input_collection} + for collection in collections_to_save: + if match_inc(input_collection[key], collection.include_regex): + collections_to_write.add(collection) + self._initialize_writers(only_initialize_if_missing=True) + tensor_value = logs[key] + tensor_refs = [] + if isinstance(tensor_value, values.PerReplica): + for t in tensor_value._values: + tensor_ref = TensorRef.from_non_graph_var("model_input") + tensor_refs.append((tensor_ref, t)) + else: + tensor_ref = TensorRef.from_non_graph_var("model_input") + tensor_refs.append((tensor_ref, logs[key])) + + for tensor_ref, t in tensor_refs: + for collection in collections_to_write: + collection.set_tensor_ref(tensor_ref) + self._save_for_tensor("model_input", t, check_before_write=False) def _add_metric(self, metric_name, metric_value: tf.Tensor = None): if metric_name in self.tensor_to_collections: @@ -446,7 +468,7 @@ def _save_tensors_post_step(self, batch, logs): ) if self._is_collection_being_saved_for_step(CollectionKeys.INPUTS): - self._save_inputs(check_before_write=False) + self._save_inputs(logs) def _get_exec_function(self, mode): # exec_function is None in 2.X; self.model exists but has no train_function, test_function, etc. From 0d8c6cb11d77ef1611aaacff64e2335d30bf25ce Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Wed, 10 Jun 2020 10:25:25 -0700 Subject: [PATCH 067/149] do not match in metric --- smdebug/tensorflow/keras.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 2b73bd18c..e3d5a1e3e 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -357,7 +357,7 @@ def _prepare_tensors_for_step(self, mode): def _save_inputs(self, logs): for key in logs: - if key in ["smdebug_input"]: + if key in ["smdebug_model_input"]: collections_to_save = self._get_collections_to_save_for_step() input_collection = self.get_collection(CollectionKeys.INPUTS) if input_collection in collections_to_save: @@ -441,7 +441,7 @@ def _save_metrics(self, batch, logs, force_save=False): self._initialize_writers(only_initialize_if_missing=True) logs["batch"] = batch for key in logs: - if key in ["loss", "val_loss", "outputs"] + ModelOutputs: + if key in ["loss", "val_loss", "outputs", "smdebug_model_input"] + ModelOutputs: # outputs is saved differently through outputs collection continue self._add_metric(metric_name=key) From ae526c04e72130ca4e91708419c381f639712fe0 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Wed, 10 Jun 2020 11:22:51 -0700 Subject: [PATCH 068/149] update --- smdebug/tensorflow/keras.py | 38 +++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index e3d5a1e3e..65cfda4a6 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -363,23 +363,27 @@ def _save_inputs(self, logs): if input_collection in collections_to_save: collections_to_write = {input_collection} for collection in collections_to_save: - if match_inc(input_collection[key], collection.include_regex): + if match_inc(key, collection.include_regex): collections_to_write.add(collection) self._initialize_writers(only_initialize_if_missing=True) - tensor_value = logs[key] - tensor_refs = [] - if isinstance(tensor_value, values.PerReplica): - for t in tensor_value._values: - tensor_ref = TensorRef.from_non_graph_var("model_input") - tensor_refs.append((tensor_ref, t)) - else: - tensor_ref = TensorRef.from_non_graph_var("model_input") - tensor_refs.append((tensor_ref, logs[key])) - - for tensor_ref, t in tensor_refs: - for collection in collections_to_write: - collection.set_tensor_ref(tensor_ref) - self._save_for_tensor("model_input", t, check_before_write=False) + for tensor_value in logs[key]: + tensor_id = 0 + tensor_refs = [] + if isinstance(tensor_value, values.PerReplica): + for t in tensor_value._values: + tensor_ref = TensorRef.from_non_graph_var( + f"model_input:{tensor_id}" + ) + tensor_refs.append((tensor_ref, t)) + else: + tensor_ref = TensorRef.from_non_graph_var(f"model_input:{tensor_id}") + tensor_refs.append((tensor_ref, tensor_value)) + + for tensor_ref, t in tensor_refs: + for collection in collections_to_write: + collection.set_tensor_ref(tensor_ref) + self._save_for_tensor("model_input", t, check_before_write=False) + tensor_id += 1 def _add_metric(self, metric_name, metric_value: tf.Tensor = None): if metric_name in self.tensor_to_collections: @@ -459,6 +463,7 @@ def _save_tensors_post_step(self, batch, logs): # weights, metrics self._save_metrics(batch, logs) self._save_model_outputs(logs) + self._save_inputs(logs) if is_tf_version_2x() and tf.executing_eagerly(): for tensor_ref in self.tensor_refs_to_save_this_step: @@ -467,9 +472,6 @@ def _save_tensors_post_step(self, batch, logs): tensor_name=tensor.name, tensor_value=tensor.value(), check_before_write=False ) - if self._is_collection_being_saved_for_step(CollectionKeys.INPUTS): - self._save_inputs(logs) - def _get_exec_function(self, mode): # exec_function is None in 2.X; self.model exists but has no train_function, test_function, etc. if self.distribution_strategy in [ From bf82f9cd0415139ab7bd3f84316b94d542975083 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Wed, 10 Jun 2020 11:28:12 -0700 Subject: [PATCH 069/149] inputs --- smdebug/tensorflow/keras.py | 1 + 1 file changed, 1 insertion(+) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 65cfda4a6..c78213e9c 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -327,6 +327,7 @@ def _prepare_non_layer_tensors(self): self.get_collection(name=CollectionKeys.OPTIMIZER_VARIABLES), self.get_collection(name=CollectionKeys.GRADIENTS), self.get_collection(name=CollectionKeys.OUTPUTS), + self.get_collection(name=CollectionKeys.INPUTS), ]: for tensor_ref in coll.get_tensors(): if tensor_ref.name not in self.tensor_to_collections: From 101fcb2e18c6e8b307b5f20f85b20fb9f938580e Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Wed, 10 Jun 2020 11:37:10 -0700 Subject: [PATCH 070/149] id --- smdebug/tensorflow/keras.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index c78213e9c..14e37a4aa 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -383,7 +383,9 @@ def _save_inputs(self, logs): for tensor_ref, t in tensor_refs: for collection in collections_to_write: collection.set_tensor_ref(tensor_ref) - self._save_for_tensor("model_input", t, check_before_write=False) + self._save_for_tensor( + f"model_input:{tensor_id}", t, check_before_write=False + ) tensor_id += 1 def _add_metric(self, metric_name, metric_value: tf.Tensor = None): From bc84269199a2ab70cbd64d882f3764da178c9625 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Sat, 13 Jun 2020 00:34:36 -0700 Subject: [PATCH 071/149] test --- tests/tensorflow2/test_keras.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index ab4c20a69..d3560cffb 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -394,8 +394,9 @@ def test_gradtape_persistent(out_dir, saveall): @pytest.mark.slow -@pytest.mark.parametrize("saveall", [True, False]) -def test_keras_fit(out_dir, tf_eager_mode, saveall): +@pytest.mark.parametrize("saveall", [True]) +def test_keras_fit(out_dir, saveall): + tf_eager_mode = True save_config = SaveConfig(save_interval=1) if saveall else None hook = smd.KerasHook(out_dir=out_dir, save_all=saveall, save_config=save_config) ts = time.time() From 50914153db5eb6012c26122222352af0c16b45d4 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Sun, 14 Jun 2020 14:33:52 -0700 Subject: [PATCH 072/149] fuse model inputs and outputs --- smdebug/tensorflow/keras.py | 41 ++++++++++++++++++++++--------------- smdebug/tensorflow/utils.py | 23 ++++++++++++++++++++- 2 files changed, 47 insertions(+), 17 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 14e37a4aa..57c2c0d96 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -15,6 +15,8 @@ from .collection import CollectionKeys from .tensor_ref import TensorRef, get_tf_names from .utils import ( + ModelInput, + ModelInputs, ModelOutput, ModelOutputs, TFDistributionStrategy, @@ -22,6 +24,8 @@ get_keras_layer_inputs, get_keras_layer_outputs, get_keras_mode, + get_model_input_export_name, + get_model_output_export_name, is_keras_optimizer, is_tf_version_2x, ) @@ -407,37 +411,39 @@ def _save_model_outputs(self, logs): if logs is None: return - export_names = { - ModelOutput.Y_PRED: "y_pred", - ModelOutput.Y: "y", - ModelOutput.VAL_Y: "val_y", - ModelOutput.VAL_Y_PRED: "val_y_pred", - } + model_input_tensor_id = 0 for key in logs: - if key in [ModelOutput.Y, ModelOutput.Y_PRED]: + if key in ModelOutputs.union(ModelInputs): collections_to_save = self._get_collections_to_save_for_step() - output_collection = self.get_collection(CollectionKeys.OUTPUTS) - if output_collection in collections_to_save: - collections_to_write = {output_collection} + if key in ModelOutputs: + key_collection = self.get_collection(CollectionKeys.OUTPUTS) + export_name = get_model_output_export_name(key) + else: + key_collection = self.get_collection(CollectionKeys.INPUTS) + export_name = get_model_input_export_name(model_input_tensor_id) + model_input_tensor_id += 1 + + if key_collection in collections_to_save: + collections_to_write = {key_collection} for collection in collections_to_save: - if match_inc(export_names[key], collection.include_regex): + if match_inc(export_name, collection.include_regex): collections_to_write.add(collection) self._initialize_writers(only_initialize_if_missing=True) tensor_value = logs[key] tensor_refs = [] if isinstance(tensor_value, values.PerReplica): for t in tensor_value._values: - tensor_ref = TensorRef.from_non_graph_var(export_names[key]) + tensor_ref = TensorRef.from_non_graph_var(export_name) tensor_refs.append((tensor_ref, t)) else: - tensor_ref = TensorRef.from_non_graph_var(export_names[key]) + tensor_ref = TensorRef.from_non_graph_var(export_name) tensor_refs.append((tensor_ref, logs[key])) for tensor_ref, t in tensor_refs: for collection in collections_to_write: collection.set_tensor_ref(tensor_ref) - self._save_for_tensor(export_names[key], t, check_before_write=False) + self._save_for_tensor(export_name, t, check_before_write=False) def _save_metrics(self, batch, logs, force_save=False): # if force_save is True, doesn't check whether collection needs to be saved for steps @@ -448,7 +454,10 @@ def _save_metrics(self, batch, logs, force_save=False): self._initialize_writers(only_initialize_if_missing=True) logs["batch"] = batch for key in logs: - if key in ["loss", "val_loss", "outputs", "smdebug_model_input"] + ModelOutputs: + if ( + key + in ["loss", "val_loss", "outputs", "smdebug_model_input"] + ModelInputOutputs + ): # outputs is saved differently through outputs collection continue self._add_metric(metric_name=key) @@ -466,7 +475,7 @@ def _save_tensors_post_step(self, batch, logs): # weights, metrics self._save_metrics(batch, logs) self._save_model_outputs(logs) - self._save_inputs(logs) + # self._save_inputs(logs) if is_tf_version_2x() and tf.executing_eagerly(): for tensor_ref in self.tensor_refs_to_save_this_step: diff --git a/smdebug/tensorflow/utils.py b/smdebug/tensorflow/utils.py index 6eed8ee44..b2f552ebe 100644 --- a/smdebug/tensorflow/utils.py +++ b/smdebug/tensorflow/utils.py @@ -19,7 +19,28 @@ class ModelOutput: VAL_Y_PRED = "val_smdebug_y_pred" -ModelOutputs = [ModelOutput.Y, ModelOutput.Y_PRED, ModelOutput.VAL_Y, ModelOutput.VAL_Y_PRED] +ModelOutputs = {ModelOutput.Y, ModelOutput.Y_PRED, ModelOutput.VAL_Y, ModelOutput.VAL_Y_PRED} + + +def get_model_output_export_name(key): + export_names = { + ModelOutput.Y_PRED: "y_pred", + ModelOutput.Y: "y", + ModelOutput.VAL_Y: "val_y", + ModelOutput.VAL_Y_PRED: "val_y_pred", + } + return export_names[key] + + +class ModelInput: + X = "smdebug_model_input" + + +ModelInputs = {ModelInput.X} + + +def get_model_input_export_name(tensor_id): + return f"model_input:{tensor_id}" class TFDistributionStrategy(Enum): From 13ce9886cd00bb754b9e0cc3a1007734e6d6d0a9 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Sun, 14 Jun 2020 17:14:55 -0700 Subject: [PATCH 073/149] set fix --- smdebug/tensorflow/keras.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 57c2c0d96..24effd4c1 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -454,10 +454,7 @@ def _save_metrics(self, batch, logs, force_save=False): self._initialize_writers(only_initialize_if_missing=True) logs["batch"] = batch for key in logs: - if ( - key - in ["loss", "val_loss", "outputs", "smdebug_model_input"] + ModelInputOutputs - ): + if key in {"loss", "val_loss", "outputs"}.union(ModelOutputs).union(ModelInputs): # outputs is saved differently through outputs collection continue self._add_metric(metric_name=key) From 460e0e0d7a1ef32028b587208296c6fe1bab738d Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Sun, 14 Jun 2020 22:19:53 -0700 Subject: [PATCH 074/149] add tests --- tests/tensorflow2/test_keras.py | 33 +++++++++++++++++++++--- tests/tensorflow2/test_keras_mirrored.py | 4 +-- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index d3560cffb..a5288f0a5 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -395,8 +395,33 @@ def test_gradtape_persistent(out_dir, saveall): @pytest.mark.slow @pytest.mark.parametrize("saveall", [True]) -def test_keras_fit(out_dir, saveall): - tf_eager_mode = True +def test_model_inputs_and_outputs(out_dir, tf_eager_mode, saveall): + hook = smd.KerasHook(out_dir=out_dir, save_all=saveall) + if saveall is False: + # explicitly save INPUTS and OUTPUTS + hook.get_collection(CollectionKeys.OUTPUTS) + hook.get_collection(CollectionKeys.INPUTS) + helper_keras_fit( + trial_dir=out_dir, + hook=hook, + eager=tf_eager_mode, + steps=["train", "eval", "predict", "train"], + ) + + trial = smd.create_trial(path=out_dir) + if saveall: + assert len(trial.steps(mode=ModeKeys.TRAIN)) == 17 + assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 2 + assert len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 1 + else: + assert len(trial.steps(mode=ModeKeys.TRAIN)) == 17 + assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 2 + assert len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 1 + + +@pytest.mark.slow +@pytest.mark.parametrize("saveall", [True]) +def test_keras_fit(out_dir, tf_eager_mode, saveall): save_config = SaveConfig(save_interval=1) if saveall else None hook = smd.KerasHook(out_dir=out_dir, save_all=saveall, save_config=save_config) ts = time.time() @@ -417,7 +442,7 @@ def test_keras_fit(out_dir, saveall): assert len(trial.steps(mode=ModeKeys.EVAL)) == 10 assert len(trial.steps(mode=ModeKeys.PREDICT)) == 4 if tf_eager_mode: - assert len(trial.tensor_names()) == (15 if is_tf_2_2() else 16) + assert len(trial.tensor_names()) == (16 if is_tf_2_2() else 17) else: assert len(trial.tensor_names()) == 24 assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 2 @@ -663,7 +688,7 @@ def test_keras_fit_pure_eager(out_dir, tf_eager_mode): helper_keras_fit(trial_dir=out_dir, hook=hook, eager=tf_eager_mode, run_eagerly=True) trial = smd.create_trial(path=out_dir) - assert len(trial.tensor_names()) == (14 if is_tf_2_2() else 15) + assert len(trial.tensor_names()) == (15 if is_tf_2_2() else 16) assert len(trial.tensor_names(collection=CollectionKeys.BIASES)) == 2 assert len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) == 2 assert len(trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES)) == 5 diff --git a/tests/tensorflow2/test_keras_mirrored.py b/tests/tensorflow2/test_keras_mirrored.py index f28deaadd..c62db5b0c 100644 --- a/tests/tensorflow2/test_keras_mirrored.py +++ b/tests/tensorflow2/test_keras_mirrored.py @@ -257,9 +257,9 @@ def test_save_all(out_dir, tf_eager_mode, workers): print(tr.tensor_names()) if tf_eager_mode: assert len(tr.tensor_names()) == ( - 6 + 2 + 1 + 5 + 1 + 2 if is_tf_2_2() else 6 + 3 + 1 + 5 + 1 + 2 + 6 + 2 + 1 + 5 + 1 + 2 + 1 if is_tf_2_2() else 6 + 3 + 1 + 5 + 1 + 2 + 1 ) - # weights, metrics, losses, optimizer variables, scalar, model outputs + # weights, metrics, losses, optimizer variables, scalar, model outputs, inputs else: assert ( len(tr.tensor_names()) From c20cc75432bee4f7eada53a1d86085e9fdb28344 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Sun, 14 Jun 2020 22:30:34 -0700 Subject: [PATCH 075/149] update test --- tests/tensorflow2/test_keras.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index a5288f0a5..78ae43bdd 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -414,10 +414,19 @@ def test_model_inputs_and_outputs(out_dir, tf_eager_mode, saveall): assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 2 assert len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 1 else: - assert len(trial.steps(mode=ModeKeys.TRAIN)) == 17 + assert len(trial.steps(mode=ModeKeys.TRAIN)) == 3 assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 2 assert len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 1 + for tname in trial.tensor_names(collection=CollectionKeys.OUTPUTS): + output = trial.tensor(tname) + assert tname in ["y", "y_pred"] + assert output.value(0) is not None + assert output.steps(mode=ModeKeys.TRAIN) == trial.steps(mode=ModeKeys.TRAIN) + # Check the shape of output tensors + assert trial.tensor("y").value(0).shape[1] == 1 # label + assert trial.tensor("y_pred").value(0).shape[1] == 10 # Output probability for each class + @pytest.mark.slow @pytest.mark.parametrize("saveall", [True]) @@ -462,14 +471,6 @@ def test_keras_fit(out_dir, tf_eager_mode, saveall): == 0, "No Optimizer Variables Should be Saved in EVAL Mode", ) - for tname in trial.tensor_names(collection=CollectionKeys.OUTPUTS): - output = trial.tensor(tname) - assert tname in ["y", "y_pred"] - assert output.value(0) is not None - assert output.steps(mode=ModeKeys.TRAIN) == trial.steps(mode=ModeKeys.TRAIN) - # Check the shape of output tensors - assert trial.tensor("y").value(0).shape[1] == 1 # label - assert trial.tensor("y_pred").value(0).shape[1] == 10 # Output probability for each class else: # save the default losses and metrics assert len(trial.tensor_names()) == (4 if is_tf_2_2() and tf_eager_mode else 5) assert len(trial.tensor_names(collection=CollectionKeys.LOSSES)) == 1 From 5766aa29f44dbd259edc29a4acfbb37c47f7f135 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Sun, 14 Jun 2020 22:47:30 -0700 Subject: [PATCH 076/149] eager mode --- tests/tensorflow2/test_keras.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 78ae43bdd..8890f8b2c 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -394,13 +394,11 @@ def test_gradtape_persistent(out_dir, saveall): @pytest.mark.slow -@pytest.mark.parametrize("saveall", [True]) -def test_model_inputs_and_outputs(out_dir, tf_eager_mode, saveall): - hook = smd.KerasHook(out_dir=out_dir, save_all=saveall) - if saveall is False: - # explicitly save INPUTS and OUTPUTS - hook.get_collection(CollectionKeys.OUTPUTS) - hook.get_collection(CollectionKeys.INPUTS) +def test_model_inputs_and_outputs(out_dir, tf_eager_mode): + hook = smd.KerasHook(out_dir=out_dir) + # explicitly save INPUTS and OUTPUTS + hook.get_collection(CollectionKeys.OUTPUTS) + hook.get_collection(CollectionKeys.INPUTS) helper_keras_fit( trial_dir=out_dir, hook=hook, @@ -409,14 +407,9 @@ def test_model_inputs_and_outputs(out_dir, tf_eager_mode, saveall): ) trial = smd.create_trial(path=out_dir) - if saveall: - assert len(trial.steps(mode=ModeKeys.TRAIN)) == 17 - assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 2 - assert len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 1 - else: - assert len(trial.steps(mode=ModeKeys.TRAIN)) == 3 - assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 2 - assert len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 1 + assert len(trial.steps(mode=ModeKeys.TRAIN)) == 3 + assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 2 + assert len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 1 for tname in trial.tensor_names(collection=CollectionKeys.OUTPUTS): output = trial.tensor(tname) From 0428d629252208c1db21171dd8bc0092723c06ce Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Sun, 14 Jun 2020 23:25:26 -0700 Subject: [PATCH 077/149] update tests --- smdebug/tensorflow/keras.py | 32 -------------------------------- smdebug/tensorflow/utils.py | 2 +- tests/tensorflow2/test_keras.py | 7 +++---- 3 files changed, 4 insertions(+), 37 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 24effd4c1..d951c3791 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -360,38 +360,6 @@ def _prepare_tensors_for_step(self, mode): non_input_tensors = set(coll.get_tensors(mode=mode)).difference(input_tensors_set) self.tensor_refs_to_save_this_step.update(non_input_tensors) - def _save_inputs(self, logs): - for key in logs: - if key in ["smdebug_model_input"]: - collections_to_save = self._get_collections_to_save_for_step() - input_collection = self.get_collection(CollectionKeys.INPUTS) - if input_collection in collections_to_save: - collections_to_write = {input_collection} - for collection in collections_to_save: - if match_inc(key, collection.include_regex): - collections_to_write.add(collection) - self._initialize_writers(only_initialize_if_missing=True) - for tensor_value in logs[key]: - tensor_id = 0 - tensor_refs = [] - if isinstance(tensor_value, values.PerReplica): - for t in tensor_value._values: - tensor_ref = TensorRef.from_non_graph_var( - f"model_input:{tensor_id}" - ) - tensor_refs.append((tensor_ref, t)) - else: - tensor_ref = TensorRef.from_non_graph_var(f"model_input:{tensor_id}") - tensor_refs.append((tensor_ref, tensor_value)) - - for tensor_ref, t in tensor_refs: - for collection in collections_to_write: - collection.set_tensor_ref(tensor_ref) - self._save_for_tensor( - f"model_input:{tensor_id}", t, check_before_write=False - ) - tensor_id += 1 - def _add_metric(self, metric_name, metric_value: tf.Tensor = None): if metric_name in self.tensor_to_collections: return diff --git a/smdebug/tensorflow/utils.py b/smdebug/tensorflow/utils.py index b2f552ebe..0e29a28f7 100644 --- a/smdebug/tensorflow/utils.py +++ b/smdebug/tensorflow/utils.py @@ -33,7 +33,7 @@ def get_model_output_export_name(key): class ModelInput: - X = "smdebug_model_input" + X = "smdebug_x" ModelInputs = {ModelInput.X} diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 8890f8b2c..c3e83e059 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -395,10 +395,10 @@ def test_gradtape_persistent(out_dir, saveall): @pytest.mark.slow def test_model_inputs_and_outputs(out_dir, tf_eager_mode): - hook = smd.KerasHook(out_dir=out_dir) # explicitly save INPUTS and OUTPUTS - hook.get_collection(CollectionKeys.OUTPUTS) - hook.get_collection(CollectionKeys.INPUTS) + include_collections = [CollectionKeys.INPUTS, CollectionKeys.OUTPUTS] + hook = smd.KerasHook(out_dir=out_dir, include_collections=include_collections) + helper_keras_fit( trial_dir=out_dir, hook=hook, @@ -415,7 +415,6 @@ def test_model_inputs_and_outputs(out_dir, tf_eager_mode): output = trial.tensor(tname) assert tname in ["y", "y_pred"] assert output.value(0) is not None - assert output.steps(mode=ModeKeys.TRAIN) == trial.steps(mode=ModeKeys.TRAIN) # Check the shape of output tensors assert trial.tensor("y").value(0).shape[1] == 1 # label assert trial.tensor("y_pred").value(0).shape[1] == 10 # Output probability for each class From 54ad7a5ca9386c33c4154d23d64b6da02fc2eb79 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Sun, 14 Jun 2020 23:31:48 -0700 Subject: [PATCH 078/149] rename fn --- smdebug/tensorflow/keras.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index d951c3791..717359a12 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -375,7 +375,7 @@ def _add_metric(self, metric_name, metric_value: tf.Tensor = None): coll.set_tensor_ref(TensorRef.from_non_graph_var(metric_name)) self.tensor_to_collections[metric_name] = {coll} - def _save_model_outputs(self, logs): + def _save_model_inputs_and_outputs(self, logs): if logs is None: return @@ -439,7 +439,7 @@ def _save_tensors_post_step(self, batch, logs): # some tensors available as value from within hook are saved here # weights, metrics self._save_metrics(batch, logs) - self._save_model_outputs(logs) + self._save_model_inputs_and_outputs(logs) # self._save_inputs(logs) if is_tf_version_2x() and tf.executing_eagerly(): From 40ded7748d2c9020f92f4f541f4f9c2324e5deeb Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Sun, 14 Jun 2020 23:40:27 -0700 Subject: [PATCH 079/149] remove unused imports --- smdebug/tensorflow/keras.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 717359a12..b240126b0 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -15,9 +15,7 @@ from .collection import CollectionKeys from .tensor_ref import TensorRef, get_tf_names from .utils import ( - ModelInput, ModelInputs, - ModelOutput, ModelOutputs, TFDistributionStrategy, get_export_name_for_keras, From 9ead6fad07e18372d6078466989c180420ae451e Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Mon, 15 Jun 2020 23:08:32 -0700 Subject: [PATCH 080/149] save custom tensor fn --- smdebug/tensorflow/keras.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index b240126b0..20a3ee55f 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -373,6 +373,21 @@ def _add_metric(self, metric_name, metric_value: tf.Tensor = None): coll.set_tensor_ref(TensorRef.from_non_graph_var(metric_name)) self.tensor_to_collections[metric_name] = {coll} + def save_tensor(self, tensor_name, tensor_value, collections_to_write): + tensor_refs = [] + if isinstance(tensor_value, values.PerReplica): + for t in tensor_value._values: + tensor_ref = TensorRef.from_non_graph_var(tensor_name) + tensor_refs.append((tensor_ref, t)) + else: + tensor_ref = TensorRef.from_non_graph_var(tensor_name) + tensor_refs.append((tensor_ref, tensor_value)) + + for tensor_ref, t in tensor_refs: + for collection in collections_to_write: + collection.set_tensor_ref(tensor_ref) + self._save_for_tensor(tensor_name, t, check_before_write=False) + def _save_model_inputs_and_outputs(self, logs): if logs is None: return @@ -397,19 +412,7 @@ def _save_model_inputs_and_outputs(self, logs): collections_to_write.add(collection) self._initialize_writers(only_initialize_if_missing=True) tensor_value = logs[key] - tensor_refs = [] - if isinstance(tensor_value, values.PerReplica): - for t in tensor_value._values: - tensor_ref = TensorRef.from_non_graph_var(export_name) - tensor_refs.append((tensor_ref, t)) - else: - tensor_ref = TensorRef.from_non_graph_var(export_name) - tensor_refs.append((tensor_ref, logs[key])) - - for tensor_ref, t in tensor_refs: - for collection in collections_to_write: - collection.set_tensor_ref(tensor_ref) - self._save_for_tensor(export_name, t, check_before_write=False) + self.save_tensor(export_name, tensor_value, collections_to_write) def _save_metrics(self, batch, logs, force_save=False): # if force_save is True, doesn't check whether collection needs to be saved for steps @@ -438,7 +441,6 @@ def _save_tensors_post_step(self, batch, logs): # weights, metrics self._save_metrics(batch, logs) self._save_model_inputs_and_outputs(logs) - # self._save_inputs(logs) if is_tf_version_2x() and tf.executing_eagerly(): for tensor_ref in self.tensor_refs_to_save_this_step: From c9a6198dc26bc2216bb6572beb55f87f72fc7e88 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Tue, 16 Jun 2020 01:32:04 -0700 Subject: [PATCH 081/149] test_ custom tensors --- smdebug/core/tfevent/event_file_reader.py | 1 + smdebug/tensorflow/keras.py | 15 +++++++++++++-- tests/tensorflow2/test_keras.py | 22 ++++++++++++++++++++++ 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/smdebug/core/tfevent/event_file_reader.py b/smdebug/core/tfevent/event_file_reader.py index 176fb9e38..264e5c5cc 100644 --- a/smdebug/core/tfevent/event_file_reader.py +++ b/smdebug/core/tfevent/event_file_reader.py @@ -39,6 +39,7 @@ def as_dtype(t): types_pb2.DT_STRING: np.str, types_pb2.DT_BOOL: np.bool, types_pb2.DT_UINT8: np.uint8, + types_pb2.DT_COMPLEX128: np.complex128, } return _INTERN_TABLE[t] diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 20a3ee55f..6f9d39df2 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -61,6 +61,7 @@ def __init__( self.tensor_refs_to_save_this_step = set() self._fetches_added = set() self.callable_cache = CallableCache() + self.custom_tensors_to_save = dict() def _is_not_supported(self): if self.distribution_strategy is None: @@ -373,7 +374,16 @@ def _add_metric(self, metric_name, metric_value: tf.Tensor = None): coll.set_tensor_ref(TensorRef.from_non_graph_var(metric_name)) self.tensor_to_collections[metric_name] = {coll} - def save_tensor(self, tensor_name, tensor_value, collections_to_write): + def save_custom_tensor(self, tensor_name, tensor_value, collections_to_write): + self.custom_tensors_to_save[tensor_name] = (tensor_value, collections_to_write) + + def _save_custom_tensors_post_step(self): + for tensor_name in self.custom_tensors_to_save: + tensor_value, collection_names = self.custom_tensors_to_save[tensor_name] + collections = [self.get_collection(c) for c in collection_names] + self._save_tensor(tensor_name, tensor_value, collections) + + def _save_tensor(self, tensor_name, tensor_value, collections_to_write): tensor_refs = [] if isinstance(tensor_value, values.PerReplica): for t in tensor_value._values: @@ -412,7 +422,7 @@ def _save_model_inputs_and_outputs(self, logs): collections_to_write.add(collection) self._initialize_writers(only_initialize_if_missing=True) tensor_value = logs[key] - self.save_tensor(export_name, tensor_value, collections_to_write) + self._save_tensor(export_name, tensor_value, collections_to_write) def _save_metrics(self, batch, logs, force_save=False): # if force_save is True, doesn't check whether collection needs to be saved for steps @@ -441,6 +451,7 @@ def _save_tensors_post_step(self, batch, logs): # weights, metrics self._save_metrics(batch, logs) self._save_model_inputs_and_outputs(logs) + self._save_custom_tensors_post_step() if is_tf_version_2x() and tf.executing_eagerly(): for tensor_ref in self.tensor_refs_to_save_this_step: diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index c3e83e059..92025888b 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -393,6 +393,28 @@ def test_gradtape_persistent(out_dir, saveall): assert len(trial.tensor_names(collection=CollectionKeys.METRICS)) == 1 +def test_save_custom_tensors(out_dir, tf_eager_mode): + include_collections = ["custom_coll"] + hook = smd.KerasHook(out_dir=out_dir, include_collections=include_collections) + t1 = tf.constant([0, 1, 1, 2, 3, 5, 8, 13, 21, 34]) + t2 = tf.Variable([5 + 4j, 6 + 1j]) + t3 = tf.Variable([False, False, False, True]) + hook.save_custom_tensor("custom_tensor_1", t1, include_collections) + hook.save_custom_tensor("custom_tensor_2", t2, include_collections) + hook.save_custom_tensor("custom_tensor_3", t3, include_collections) + + helper_keras_fit( + trial_dir=out_dir, + hook=hook, + eager=tf_eager_mode, + steps=["train", "eval", "predict", "train"], + ) + trial = smd.create_trial(path=out_dir) + assert len(trial.steps(mode=ModeKeys.TRAIN)) == 3 + for tname in trial.tensor_names(collection="custom_coll"): + assert trial.tensor(tname).value(0) is not None + + @pytest.mark.slow def test_model_inputs_and_outputs(out_dir, tf_eager_mode): # explicitly save INPUTS and OUTPUTS From 7c7fbb340f415e89711e038b6527135fdac16d7e Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Tue, 16 Jun 2020 01:52:33 -0700 Subject: [PATCH 082/149] revert tests --- tests/tensorflow2/test_keras.py | 75 +++++++++++------------- tests/tensorflow2/test_keras_mirrored.py | 12 ++-- 2 files changed, 38 insertions(+), 49 deletions(-) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index c3e83e059..5efff6758 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -394,37 +394,9 @@ def test_gradtape_persistent(out_dir, saveall): @pytest.mark.slow -def test_model_inputs_and_outputs(out_dir, tf_eager_mode): - # explicitly save INPUTS and OUTPUTS - include_collections = [CollectionKeys.INPUTS, CollectionKeys.OUTPUTS] - hook = smd.KerasHook(out_dir=out_dir, include_collections=include_collections) - - helper_keras_fit( - trial_dir=out_dir, - hook=hook, - eager=tf_eager_mode, - steps=["train", "eval", "predict", "train"], - ) - - trial = smd.create_trial(path=out_dir) - assert len(trial.steps(mode=ModeKeys.TRAIN)) == 3 - assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 2 - assert len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 1 - - for tname in trial.tensor_names(collection=CollectionKeys.OUTPUTS): - output = trial.tensor(tname) - assert tname in ["y", "y_pred"] - assert output.value(0) is not None - # Check the shape of output tensors - assert trial.tensor("y").value(0).shape[1] == 1 # label - assert trial.tensor("y_pred").value(0).shape[1] == 10 # Output probability for each class - - -@pytest.mark.slow -@pytest.mark.parametrize("saveall", [True]) +@pytest.mark.parametrize("saveall", [True, False]) def test_keras_fit(out_dir, tf_eager_mode, saveall): - save_config = SaveConfig(save_interval=1) if saveall else None - hook = smd.KerasHook(out_dir=out_dir, save_all=saveall, save_config=save_config) + hook = smd.KerasHook(out_dir=out_dir, save_all=saveall) ts = time.time() hook.save_scalar("foobar", 1, sm_metric=True, timestamp=ts) scalars_to_be_saved = dict() @@ -439,18 +411,10 @@ def test_keras_fit(out_dir, tf_eager_mode, saveall): trial = smd.create_trial(path=out_dir) # can't save gradients in TF 2.x eager mode if saveall: # save losses, metrics, weights, biases, scalar - assert len(trial.steps(mode=ModeKeys.TRAIN)) == 20 - assert len(trial.steps(mode=ModeKeys.EVAL)) == 10 - assert len(trial.steps(mode=ModeKeys.PREDICT)) == 4 if tf_eager_mode: - assert len(trial.tensor_names()) == (16 if is_tf_2_2() else 17) + assert len(trial.tensor_names()) == (13 if is_tf_2_2() else 14) else: - assert len(trial.tensor_names()) == 24 - assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 2 - assert ( - len(trial.tensor_names(collection=CollectionKeys.OUTPUTS, mode=ModeKeys.PREDICT)) == 0 - ) # bug: - assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS, mode=ModeKeys.TRAIN)) == 2 + assert len(trial.tensor_names()) == 21 assert len(trial.tensor_names(collection=CollectionKeys.BIASES)) == 2 assert len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) == 2 assert len(trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES)) == 5 @@ -620,7 +584,7 @@ def test_include_collections(out_dir, tf_eager_mode): trial = smd.create_trial(path=out_dir) # can't save gradients in TF 2.x if tf_eager_mode: - assert len(trial.tensor_names()) == (14 if is_tf_2_2() else 15) + assert len(trial.tensor_names()) == (12 if is_tf_2_2() else 13) else: assert len(trial.tensor_names()) == 18 assert len(trial.tensor_names(collection=CollectionKeys.GRADIENTS)) == 4 @@ -681,7 +645,34 @@ def test_keras_fit_pure_eager(out_dir, tf_eager_mode): helper_keras_fit(trial_dir=out_dir, hook=hook, eager=tf_eager_mode, run_eagerly=True) trial = smd.create_trial(path=out_dir) - assert len(trial.tensor_names()) == (15 if is_tf_2_2() else 16) + assert len(trial.tensor_names()) == (12 if is_tf_2_2() else 13) assert len(trial.tensor_names(collection=CollectionKeys.BIASES)) == 2 assert len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) == 2 assert len(trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES)) == 5 + + +@pytest.mark.skip # skip until aws tf update +def test_model_inputs_and_outputs(out_dir, tf_eager_mode): + # explicitly save INPUTS and OUTPUTS + include_collections = [CollectionKeys.INPUTS, CollectionKeys.OUTPUTS] + hook = smd.KerasHook(out_dir=out_dir, include_collections=include_collections) + + helper_keras_fit( + trial_dir=out_dir, + hook=hook, + eager=tf_eager_mode, + steps=["train", "eval", "predict", "train"], + ) + + trial = smd.create_trial(path=out_dir) + assert len(trial.steps(mode=ModeKeys.TRAIN)) == 3 + assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 2 + assert len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 1 + + for tname in trial.tensor_names(collection=CollectionKeys.OUTPUTS): + output = trial.tensor(tname) + assert tname in ["y", "y_pred"] + assert output.value(0) is not None + # Check the shape of output tensors + assert trial.tensor("y").value(0).shape[1] == 1 # label + assert trial.tensor("y_pred").value(0).shape[1] == 10 # Output probability for each class diff --git a/tests/tensorflow2/test_keras_mirrored.py b/tests/tensorflow2/test_keras_mirrored.py index c62db5b0c..3ff6f307a 100644 --- a/tests/tensorflow2/test_keras_mirrored.py +++ b/tests/tensorflow2/test_keras_mirrored.py @@ -165,10 +165,10 @@ def exhaustive_check(trial_dir, include_workers="one", eager=True): assert len(tr.workers()) == strategy.num_replicas_in_sync if eager: assert len(tr.tensor_names()) == ( - 6 + 1 + 2 + 5 + 1 + 2 if is_tf_2_2() else 6 + 1 + 3 + 5 + 1 + 2 + 6 + 1 + 2 + 5 + 1 if is_tf_2_2() else 6 + 1 + 3 + 5 + 1 ) - # 6 weights, 1 loss, 3 metrics, 5 optimizer variables for Tf 2.1, 1 scalar, 2 model outputs - # 6 weights, 1 loss, 2 metrics, 5 optimizer variables for Tf 2.2, 1 scalar, 2 model outputs + # 6 weights, 1 loss, 3 metrics, 5 optimizer variables for Tf 2.1, 1 scalar + # 6 weights, 1 loss, 2 metrics, 5 optimizer variables for Tf 2.2, 1 scalar else: assert len(tr.tensor_names()) == (6 + 6 + 1 + 3 + strategy.num_replicas_in_sync * 3 + 5) else: @@ -256,10 +256,8 @@ def test_save_all(out_dir, tf_eager_mode, workers): tr = create_trial_fast_refresh(out_dir) print(tr.tensor_names()) if tf_eager_mode: - assert len(tr.tensor_names()) == ( - 6 + 2 + 1 + 5 + 1 + 2 + 1 if is_tf_2_2() else 6 + 3 + 1 + 5 + 1 + 2 + 1 - ) - # weights, metrics, losses, optimizer variables, scalar, model outputs, inputs + assert len(tr.tensor_names()) == (6 + 2 + 1 + 5 + 1 if is_tf_2_2() else 6 + 3 + 1 + 5 + 1) + # weights, metrics, losses, optimizer variables, scalar else: assert ( len(tr.tensor_names()) From ab8d103b27399d5a01150ccdba5d351c51f4d557 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Mon, 15 Jun 2020 23:08:32 -0700 Subject: [PATCH 083/149] save custom tensor fn --- smdebug/tensorflow/keras.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index b240126b0..20a3ee55f 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -373,6 +373,21 @@ def _add_metric(self, metric_name, metric_value: tf.Tensor = None): coll.set_tensor_ref(TensorRef.from_non_graph_var(metric_name)) self.tensor_to_collections[metric_name] = {coll} + def save_tensor(self, tensor_name, tensor_value, collections_to_write): + tensor_refs = [] + if isinstance(tensor_value, values.PerReplica): + for t in tensor_value._values: + tensor_ref = TensorRef.from_non_graph_var(tensor_name) + tensor_refs.append((tensor_ref, t)) + else: + tensor_ref = TensorRef.from_non_graph_var(tensor_name) + tensor_refs.append((tensor_ref, tensor_value)) + + for tensor_ref, t in tensor_refs: + for collection in collections_to_write: + collection.set_tensor_ref(tensor_ref) + self._save_for_tensor(tensor_name, t, check_before_write=False) + def _save_model_inputs_and_outputs(self, logs): if logs is None: return @@ -397,19 +412,7 @@ def _save_model_inputs_and_outputs(self, logs): collections_to_write.add(collection) self._initialize_writers(only_initialize_if_missing=True) tensor_value = logs[key] - tensor_refs = [] - if isinstance(tensor_value, values.PerReplica): - for t in tensor_value._values: - tensor_ref = TensorRef.from_non_graph_var(export_name) - tensor_refs.append((tensor_ref, t)) - else: - tensor_ref = TensorRef.from_non_graph_var(export_name) - tensor_refs.append((tensor_ref, logs[key])) - - for tensor_ref, t in tensor_refs: - for collection in collections_to_write: - collection.set_tensor_ref(tensor_ref) - self._save_for_tensor(export_name, t, check_before_write=False) + self.save_tensor(export_name, tensor_value, collections_to_write) def _save_metrics(self, batch, logs, force_save=False): # if force_save is True, doesn't check whether collection needs to be saved for steps @@ -438,7 +441,6 @@ def _save_tensors_post_step(self, batch, logs): # weights, metrics self._save_metrics(batch, logs) self._save_model_inputs_and_outputs(logs) - # self._save_inputs(logs) if is_tf_version_2x() and tf.executing_eagerly(): for tensor_ref in self.tensor_refs_to_save_this_step: From 63babf7729c64b644c3e71cf141a58ad60670fb6 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Tue, 16 Jun 2020 01:32:04 -0700 Subject: [PATCH 084/149] test_ custom tensors --- smdebug/core/tfevent/event_file_reader.py | 1 + smdebug/tensorflow/keras.py | 15 +++++++++++++-- tests/tensorflow2/test_keras.py | 22 ++++++++++++++++++++++ 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/smdebug/core/tfevent/event_file_reader.py b/smdebug/core/tfevent/event_file_reader.py index 176fb9e38..264e5c5cc 100644 --- a/smdebug/core/tfevent/event_file_reader.py +++ b/smdebug/core/tfevent/event_file_reader.py @@ -39,6 +39,7 @@ def as_dtype(t): types_pb2.DT_STRING: np.str, types_pb2.DT_BOOL: np.bool, types_pb2.DT_UINT8: np.uint8, + types_pb2.DT_COMPLEX128: np.complex128, } return _INTERN_TABLE[t] diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 20a3ee55f..6f9d39df2 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -61,6 +61,7 @@ def __init__( self.tensor_refs_to_save_this_step = set() self._fetches_added = set() self.callable_cache = CallableCache() + self.custom_tensors_to_save = dict() def _is_not_supported(self): if self.distribution_strategy is None: @@ -373,7 +374,16 @@ def _add_metric(self, metric_name, metric_value: tf.Tensor = None): coll.set_tensor_ref(TensorRef.from_non_graph_var(metric_name)) self.tensor_to_collections[metric_name] = {coll} - def save_tensor(self, tensor_name, tensor_value, collections_to_write): + def save_custom_tensor(self, tensor_name, tensor_value, collections_to_write): + self.custom_tensors_to_save[tensor_name] = (tensor_value, collections_to_write) + + def _save_custom_tensors_post_step(self): + for tensor_name in self.custom_tensors_to_save: + tensor_value, collection_names = self.custom_tensors_to_save[tensor_name] + collections = [self.get_collection(c) for c in collection_names] + self._save_tensor(tensor_name, tensor_value, collections) + + def _save_tensor(self, tensor_name, tensor_value, collections_to_write): tensor_refs = [] if isinstance(tensor_value, values.PerReplica): for t in tensor_value._values: @@ -412,7 +422,7 @@ def _save_model_inputs_and_outputs(self, logs): collections_to_write.add(collection) self._initialize_writers(only_initialize_if_missing=True) tensor_value = logs[key] - self.save_tensor(export_name, tensor_value, collections_to_write) + self._save_tensor(export_name, tensor_value, collections_to_write) def _save_metrics(self, batch, logs, force_save=False): # if force_save is True, doesn't check whether collection needs to be saved for steps @@ -441,6 +451,7 @@ def _save_tensors_post_step(self, batch, logs): # weights, metrics self._save_metrics(batch, logs) self._save_model_inputs_and_outputs(logs) + self._save_custom_tensors_post_step() if is_tf_version_2x() and tf.executing_eagerly(): for tensor_ref in self.tensor_refs_to_save_this_step: diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 5efff6758..9b68bd5bd 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -393,6 +393,28 @@ def test_gradtape_persistent(out_dir, saveall): assert len(trial.tensor_names(collection=CollectionKeys.METRICS)) == 1 +def test_save_custom_tensors(out_dir, tf_eager_mode): + include_collections = ["custom_coll"] + hook = smd.KerasHook(out_dir=out_dir, include_collections=include_collections) + t1 = tf.constant([0, 1, 1, 2, 3, 5, 8, 13, 21, 34]) + t2 = tf.Variable([5 + 4j, 6 + 1j]) + t3 = tf.Variable([False, False, False, True]) + hook.save_custom_tensor("custom_tensor_1", t1, include_collections) + hook.save_custom_tensor("custom_tensor_2", t2, include_collections) + hook.save_custom_tensor("custom_tensor_3", t3, include_collections) + + helper_keras_fit( + trial_dir=out_dir, + hook=hook, + eager=tf_eager_mode, + steps=["train", "eval", "predict", "train"], + ) + trial = smd.create_trial(path=out_dir) + assert len(trial.steps(mode=ModeKeys.TRAIN)) == 3 + for tname in trial.tensor_names(collection="custom_coll"): + assert trial.tensor(tname).value(0) is not None + + @pytest.mark.slow @pytest.mark.parametrize("saveall", [True, False]) def test_keras_fit(out_dir, tf_eager_mode, saveall): From 9633e2ed4c1ea8f142fc2da39a1319125d558047 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Tue, 16 Jun 2020 01:53:24 -0700 Subject: [PATCH 085/149] save custom tensor --- tests/tensorflow2/test_keras.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 9b68bd5bd..cf63d2ae3 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -698,3 +698,25 @@ def test_model_inputs_and_outputs(out_dir, tf_eager_mode): # Check the shape of output tensors assert trial.tensor("y").value(0).shape[1] == 1 # label assert trial.tensor("y_pred").value(0).shape[1] == 10 # Output probability for each class + + +def test_save_custom_tensors(out_dir, tf_eager_mode): + include_collections = ["custom_coll"] + hook = smd.KerasHook(out_dir=out_dir, include_collections=include_collections) + t1 = tf.constant([0, 1, 1, 2, 3, 5, 8, 13, 21, 34]) + t2 = tf.Variable([5 + 4j, 6 + 1j]) + t3 = tf.Variable([False, False, False, True]) + hook.save_custom_tensor("custom_tensor_1", t1, include_collections) + hook.save_custom_tensor("custom_tensor_2", t2, include_collections) + hook.save_custom_tensor("custom_tensor_3", t3, include_collections) + + helper_keras_fit( + trial_dir=out_dir, + hook=hook, + eager=tf_eager_mode, + steps=["train", "eval", "predict", "train"], + ) + trial = smd.create_trial(path=out_dir) + assert len(trial.steps(mode=ModeKeys.TRAIN)) == 3 + for tname in trial.tensor_names(collection="custom_coll"): + assert trial.tensor(tname).value(0) is not None From 1376045c7b00ba3565d3e5e8b19fc7ef1abd6e56 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Tue, 16 Jun 2020 21:14:00 -0700 Subject: [PATCH 086/149] init --- smdebug/tensorflow/keras.py | 9 ++++++--- tests/tensorflow2/test_keras.py | 3 +-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 6f9d39df2..d2017e949 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -398,18 +398,21 @@ def _save_tensor(self, tensor_name, tensor_value, collections_to_write): collection.set_tensor_ref(tensor_ref) self._save_for_tensor(tensor_name, t, check_before_write=False) - def _save_model_inputs_and_outputs(self, logs): + def _smdebug_logs(self, logs): if logs is None: return model_input_tensor_id = 0 for key in logs: - if key in ModelOutputs.union(ModelInputs): + if key in ModelOutputs.union(ModelInputs).union({"smdebug_gradients"}): collections_to_save = self._get_collections_to_save_for_step() if key in ModelOutputs: key_collection = self.get_collection(CollectionKeys.OUTPUTS) export_name = get_model_output_export_name(key) + elif key == "smdebug_gradients": + key_collection = self.get_collection(CollectionKeys.GRADIENTS) + # save gradient here else: key_collection = self.get_collection(CollectionKeys.INPUTS) export_name = get_model_input_export_name(model_input_tensor_id) @@ -450,7 +453,7 @@ def _save_tensors_post_step(self, batch, logs): # some tensors available as value from within hook are saved here # weights, metrics self._save_metrics(batch, logs) - self._save_model_inputs_and_outputs(logs) + self._smdebug_logs(logs) self._save_custom_tensors_post_step() if is_tf_version_2x() and tf.executing_eagerly(): diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 35817b9af..b3e6b451e 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -655,7 +655,7 @@ def test_keras_fit_pure_eager(out_dir, tf_eager_mode): def test_model_inputs_and_outputs(out_dir, tf_eager_mode): # explicitly save INPUTS and OUTPUTS include_collections = [CollectionKeys.INPUTS, CollectionKeys.OUTPUTS] - hook = smd.KerasHook(out_dir=out_dir, include_collections=include_collections) + hook = smd.KerasHook(out_dir=out_dir, include_collections=include_collections, save_all=True) helper_keras_fit( trial_dir=out_dir, @@ -663,7 +663,6 @@ def test_model_inputs_and_outputs(out_dir, tf_eager_mode): eager=tf_eager_mode, steps=["train", "eval", "predict", "train"], ) - trial = smd.create_trial(path=out_dir) assert len(trial.steps(mode=ModeKeys.TRAIN)) == 3 assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 2 From 05b28c5bc6b2e797c495cd2f86fb87ac87be58a0 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Tue, 16 Jun 2020 22:26:43 -0700 Subject: [PATCH 087/149] save gradients --- smdebug/tensorflow/keras.py | 13 ++++++++++--- tests/tensorflow2/test_keras.py | 26 ++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index d2017e949..bc97d8e0c 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -405,18 +405,25 @@ def _smdebug_logs(self, logs): model_input_tensor_id = 0 for key in logs: + tensors_to_save = [] if key in ModelOutputs.union(ModelInputs).union({"smdebug_gradients"}): collections_to_save = self._get_collections_to_save_for_step() if key in ModelOutputs: key_collection = self.get_collection(CollectionKeys.OUTPUTS) export_name = get_model_output_export_name(key) + tensors_to_save.append((export_name, logs[key])) elif key == "smdebug_gradients": key_collection = self.get_collection(CollectionKeys.GRADIENTS) - # save gradient here + gradients = logs[key] + for g, v in zip(gradients, self.model.trainable_variables): + layer = v.name.split(":")[0] + export_name = "gradients/" + layer + "Grad" + tensors_to_save.append((export_name, g)) else: key_collection = self.get_collection(CollectionKeys.INPUTS) export_name = get_model_input_export_name(model_input_tensor_id) model_input_tensor_id += 1 + tensors_to_save.append((export_name, logs[key])) if key_collection in collections_to_save: collections_to_write = {key_collection} @@ -424,8 +431,8 @@ def _smdebug_logs(self, logs): if match_inc(export_name, collection.include_regex): collections_to_write.add(collection) self._initialize_writers(only_initialize_if_missing=True) - tensor_value = logs[key] - self._save_tensor(export_name, tensor_value, collections_to_write) + for t_name, t_value in tensors_to_save: + self._save_tensor(t_name, t_value, collections_to_write) def _save_metrics(self, batch, logs, force_save=False): # if force_save is True, doesn't check whether collection needs to be saved for steps diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index b3e6b451e..22a81cef9 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -677,6 +677,32 @@ def test_model_inputs_and_outputs(out_dir, tf_eager_mode): assert trial.tensor("y_pred").value(0).shape[1] == 10 # Output probability for each class +@pytest.mark.skip # skip until aws tf update +def test_save_gradients(out_dir, tf_eager_mode): + # explicitly save INPUTS and OUTPUTS + include_collections = [CollectionKeys.GRADIENTS] + hook = smd.KerasHook(out_dir=out_dir, include_collections=include_collections) + + helper_keras_fit( + trial_dir=out_dir, + hook=hook, + eager=tf_eager_mode, + steps=["train", "eval", "predict", "train"], + ) + trial = smd.create_trial(path=out_dir) + assert len(trial.steps(mode=ModeKeys.TRAIN)) == 3 + assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 2 + assert len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 1 + + for tname in trial.tensor_names(collection=CollectionKeys.OUTPUTS): + output = trial.tensor(tname) + assert tname in ["y", "y_pred"] + assert output.value(0) is not None + # Check the shape of output tensors + assert trial.tensor("y").value(0).shape[1] == 1 # label + assert trial.tensor("y_pred").value(0).shape[1] == 10 # Output probability for each class + + def test_save_custom_tensors(out_dir, tf_eager_mode): include_collections = ["custom_coll"] hook = smd.KerasHook(out_dir=out_dir, include_collections=include_collections) From 9ae86dff8c647dd81c5744ee16a73fcf47320bef Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Tue, 16 Jun 2020 22:42:59 -0700 Subject: [PATCH 088/149] ignore smdebug metrics --- smdebug/tensorflow/keras.py | 4 +++- tests/tensorflow2/test_keras.py | 13 ++++--------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index bc97d8e0c..bcd64f65a 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -443,7 +443,9 @@ def _save_metrics(self, batch, logs, force_save=False): self._initialize_writers(only_initialize_if_missing=True) logs["batch"] = batch for key in logs: - if key in {"loss", "val_loss", "outputs"}.union(ModelOutputs).union(ModelInputs): + if key in {"loss", "val_loss", "outputs", "smdebug_gradients"}.union( + ModelOutputs + ).union(ModelInputs): # outputs is saved differently through outputs collection continue self._add_metric(metric_name=key) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 22a81cef9..2aaf7e57e 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -655,7 +655,7 @@ def test_keras_fit_pure_eager(out_dir, tf_eager_mode): def test_model_inputs_and_outputs(out_dir, tf_eager_mode): # explicitly save INPUTS and OUTPUTS include_collections = [CollectionKeys.INPUTS, CollectionKeys.OUTPUTS] - hook = smd.KerasHook(out_dir=out_dir, include_collections=include_collections, save_all=True) + hook = smd.KerasHook(out_dir=out_dir, include_collections=include_collections) helper_keras_fit( trial_dir=out_dir, @@ -690,17 +690,12 @@ def test_save_gradients(out_dir, tf_eager_mode): steps=["train", "eval", "predict", "train"], ) trial = smd.create_trial(path=out_dir) - assert len(trial.steps(mode=ModeKeys.TRAIN)) == 3 - assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 2 - assert len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 1 + assert len(trial.steps(mode=ModeKeys.TRAIN)) == 4 + assert len(trial.tensor_names(collection=CollectionKeys.GRADIENTS)) == 4 - for tname in trial.tensor_names(collection=CollectionKeys.OUTPUTS): + for tname in trial.tensor_names(collection=CollectionKeys.GRADIENTS): output = trial.tensor(tname) - assert tname in ["y", "y_pred"] assert output.value(0) is not None - # Check the shape of output tensors - assert trial.tensor("y").value(0).shape[1] == 1 # label - assert trial.tensor("y_pred").value(0).shape[1] == 10 # Output probability for each class def test_save_custom_tensors(out_dir, tf_eager_mode): From c8a08440664b5631fdbf8071ccccf4bc96a51ff0 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Wed, 17 Jun 2020 00:48:00 -0700 Subject: [PATCH 089/149] update assert --- tests/tensorflow2/test_keras.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 2aaf7e57e..93aa2746e 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -690,7 +690,6 @@ def test_save_gradients(out_dir, tf_eager_mode): steps=["train", "eval", "predict", "train"], ) trial = smd.create_trial(path=out_dir) - assert len(trial.steps(mode=ModeKeys.TRAIN)) == 4 assert len(trial.tensor_names(collection=CollectionKeys.GRADIENTS)) == 4 for tname in trial.tensor_names(collection=CollectionKeys.GRADIENTS): From 3db685610deb2dc6e2025cec1aa65724d8ba473e Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Wed, 17 Jun 2020 01:04:48 -0700 Subject: [PATCH 090/149] gradients --- smdebug/tensorflow/keras.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index bcd64f65a..5bb5adb44 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -415,10 +415,11 @@ def _smdebug_logs(self, logs): elif key == "smdebug_gradients": key_collection = self.get_collection(CollectionKeys.GRADIENTS) gradients = logs[key] - for g, v in zip(gradients, self.model.trainable_variables): - layer = v.name.split(":")[0] - export_name = "gradients/" + layer + "Grad" - tensors_to_save.append((export_name, g)) + if gradients is not None: + for g, v in zip(gradients, self.model.trainable_variables): + layer = v.name.split(":")[0] + export_name = "gradients/" + layer + "Grad" + tensors_to_save.append((export_name, g)) else: key_collection = self.get_collection(CollectionKeys.INPUTS) export_name = get_model_input_export_name(model_input_tensor_id) From 32affd2075046407effe3ea9f0ef85b5a0177a64 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Fri, 19 Jun 2020 02:00:30 -0700 Subject: [PATCH 091/149] save inputs --- smdebug/tensorflow/constants.py | 2 + smdebug/tensorflow/keras.py | 108 +++++++++++++++++------ smdebug/tensorflow/utils.py | 56 +++++++++++- tests/tensorflow2/test_keras.py | 56 ++++++++++-- tests/tensorflow2/test_keras_mirrored.py | 1 + 5 files changed, 185 insertions(+), 38 deletions(-) create mode 100644 smdebug/tensorflow/constants.py diff --git a/smdebug/tensorflow/constants.py b/smdebug/tensorflow/constants.py new file mode 100644 index 000000000..c2de1677f --- /dev/null +++ b/smdebug/tensorflow/constants.py @@ -0,0 +1,2 @@ +SMDEBUG_GRADIENTS_KEY = "smdebug_gradients" +SMDEBUG_LAYER_OUTPUTS_KEY = "smdebug_layer_outputs" diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 5bb5adb44..0b1993a23 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -9,13 +9,14 @@ from smdebug.core.modes import ModeKeys, str_to_mode_keys from smdebug.core.utils import match_inc from smdebug.tensorflow.callable_cache import CallableCache +from smdebug.tensorflow.utils import InputOutputSaver, get_layer_call_fn # Local from .base_hook import TensorflowBaseHook from .collection import CollectionKeys +from .constants import SMDEBUG_GRADIENTS_KEY, SMDEBUG_LAYER_OUTPUTS_KEY from .tensor_ref import TensorRef, get_tf_names from .utils import ( - ModelInputs, ModelOutputs, TFDistributionStrategy, get_export_name_for_keras, @@ -62,6 +63,7 @@ def __init__( self._fetches_added = set() self.callable_cache = CallableCache() self.custom_tensors_to_save = dict() + self.saved_layers = dict() def _is_not_supported(self): if self.distribution_strategy is None: @@ -262,12 +264,16 @@ def _get_distributed_model(self, mode): return get_distributed_model(self.model, get_keras_mode(mode)) - def _is_input_layer(self, mode, layer_inputs): - model_inputs = [] + def _get_model(self, mode): if self.distribution_strategy == TFDistributionStrategy.MIRRORED: model = self._get_distributed_model(mode) else: model = self.model + return model + + def _is_input_layer(self, mode, layer_inputs): + model_inputs = [] + model = self._get_model(mode) # when in mirrored strategy if hasattr(model, "values"): for per_replica_model in model.values: @@ -278,10 +284,7 @@ def _is_input_layer(self, mode, layer_inputs): def _is_output_layer(self, mode, layer_outputs): model_outputs = [] - if self.distribution_strategy == TFDistributionStrategy.MIRRORED: - model = self._get_distributed_model(mode) - else: - model = self.model + model = self._get_model(mode) # when in mirrored strategy if hasattr(model, "values"): for per_replica_model in model.values: @@ -375,15 +378,22 @@ def _add_metric(self, metric_name, metric_value: tf.Tensor = None): self.tensor_to_collections[metric_name] = {coll} def save_custom_tensor(self, tensor_name, tensor_value, collections_to_write): - self.custom_tensors_to_save[tensor_name] = (tensor_value, collections_to_write) + for collection in collections_to_write: + self.custom_tensors_to_save[tensor_name] = (tensor_value, collection) def _save_custom_tensors_post_step(self): for tensor_name in self.custom_tensors_to_save: tensor_value, collection_names = self.custom_tensors_to_save[tensor_name] - collections = [self.get_collection(c) for c in collection_names] - self._save_tensor(tensor_name, tensor_value, collections) + self._save_tensor(tensor_name, tensor_value, collection_names) def _save_tensor(self, tensor_name, tensor_value, collections_to_write): + if isinstance(collections_to_write, set) is False: + collections_to_write = {collections_to_write} + collections_to_save = self._get_collections_to_save_for_step() + for collection in collections_to_save: + if match_inc(tensor_name, collection.include_regex): + collections_to_write.add(collection) + self._initialize_writers(only_initialize_if_missing=True) tensor_refs = [] if isinstance(tensor_value, values.PerReplica): for t in tensor_value._values: @@ -405,33 +415,50 @@ def _smdebug_logs(self, logs): model_input_tensor_id = 0 for key in logs: - tensors_to_save = [] - if key in ModelOutputs.union(ModelInputs).union({"smdebug_gradients"}): - collections_to_save = self._get_collections_to_save_for_step() + if "smdebug_" in key: if key in ModelOutputs: - key_collection = self.get_collection(CollectionKeys.OUTPUTS) + tensors_to_save = [] export_name = get_model_output_export_name(key) tensors_to_save.append((export_name, logs[key])) - elif key == "smdebug_gradients": - key_collection = self.get_collection(CollectionKeys.GRADIENTS) + collections_to_write = {self.get_collection(CollectionKeys.OUTPUTS)} + for t_name, t_value in tensors_to_save: + self._save_tensor(t_name, t_value, collections_to_write) + elif key == SMDEBUG_GRADIENTS_KEY: + tensors_to_save = [] gradients = logs[key] if gradients is not None: for g, v in zip(gradients, self.model.trainable_variables): layer = v.name.split(":")[0] export_name = "gradients/" + layer + "Grad" tensors_to_save.append((export_name, g)) + collections_to_write = {self.get_collection(CollectionKeys.GRADIENTS)} + for t_name, t_value in tensors_to_save: + self._save_tensor(t_name, t_value, collections_to_write) + elif key == SMDEBUG_LAYER_OUTPUTS_KEY: + layer_outputs = logs[key] + if layer_outputs is not None: + tensors_to_save = [] + collections_to_write = {self.get_collection(CollectionKeys.OUTPUTS)} + # run the loop backwards to save layer outputs + for o, l in zip(layer_outputs, self.model.layers): + export_name = get_export_name_for_keras(l.name, "output") + tensors_to_save.append((export_name, o)) + for t_name, t_value in tensors_to_save: + self._save_tensor(t_name, t_value, collections_to_write) + tensors_to_save = [] + collections_to_write = {self.get_collection(CollectionKeys.INPUTS)} + # run the loop backwards to save layer inputs + for i, l in zip(reversed(layer_outputs), reversed(self.model.layers)): + export_name = get_export_name_for_keras(l.name, "input") + tensors_to_save.append((export_name, i)) + for t_name, t_value in tensors_to_save: + self._save_tensor(t_name, t_value, collections_to_write) else: - key_collection = self.get_collection(CollectionKeys.INPUTS) + tensors_to_save = [] export_name = get_model_input_export_name(model_input_tensor_id) model_input_tensor_id += 1 tensors_to_save.append((export_name, logs[key])) - - if key_collection in collections_to_save: - collections_to_write = {key_collection} - for collection in collections_to_save: - if match_inc(export_name, collection.include_regex): - collections_to_write.add(collection) - self._initialize_writers(only_initialize_if_missing=True) + collections_to_write = {self.get_collection(CollectionKeys.INPUTS)} for t_name, t_value in tensors_to_save: self._save_tensor(t_name, t_value, collections_to_write) @@ -444,9 +471,7 @@ def _save_metrics(self, batch, logs, force_save=False): self._initialize_writers(only_initialize_if_missing=True) logs["batch"] = batch for key in logs: - if key in {"loss", "val_loss", "outputs", "smdebug_gradients"}.union( - ModelOutputs - ).union(ModelInputs): + if key in {"loss", "val_loss", "outputs"} or "smdebug_" in key: # outputs is saved differently through outputs collection continue self._add_metric(metric_name=key) @@ -459,11 +484,30 @@ def _save_metrics(self, batch, logs, force_save=False): self._add_metric(metric_name=key) self._save_for_tensor(key, logs[key], check_before_write=False) + def _save_layer_input_and_outputs(self): + if self.model.run_eagerly is False: + # This function only works when the run_eagerly is True + return + for layer_name in self.saved_layers: + # Save Input + tensor = self.saved_layers[layer_name].layer_input + export_name = get_export_name_for_keras(layer_name, tensor_type="input", tensor=tensor) + self._save_tensor( + export_name, tensor.numpy(), self.get_collection(CollectionKeys.INPUTS) + ) + # Save Output + tensor = self.saved_layers[layer_name].layer_output + export_name = get_export_name_for_keras(layer_name, tensor_type="output", tensor=tensor) + self._save_tensor( + export_name, tensor.numpy(), self.get_collection(CollectionKeys.OUTPUTS) + ) + def _save_tensors_post_step(self, batch, logs): # some tensors available as value from within hook are saved here # weights, metrics self._save_metrics(batch, logs) self._smdebug_logs(logs) + self._save_layer_input_and_outputs() self._save_custom_tensors_post_step() if is_tf_version_2x() and tf.executing_eagerly(): @@ -578,6 +622,15 @@ def on_predict_end(self, logs=None): def on_predict_begin(self, logs=None): self._on_any_mode_begin(ModeKeys.PREDICT) + def _wrap_model_with_input_output_saver(self): + for layer in self.model.layers: + layer._hooks = [] + layer.call = get_layer_call_fn(layer) + layer.register_hook = lambda hook: layer._hooks.append(hook) + saver = InputOutputSaver() + layer.register_hook(saver) + self.saved_layers[layer.name] = saver + def _on_any_batch_begin(self, batch, mode, logs=None): if self._is_not_supported(): return @@ -600,6 +653,7 @@ def _on_any_batch_begin(self, batch, mode, logs=None): if (is_tf_version_2x() and tf.executing_eagerly()) or self._validate_exec_function( self._get_exec_function(mode) ): + self._wrap_model_with_input_output_saver() self._prepare_layers(mode) self._prepare_non_layer_tensors() self._prepared_tensors[mode] = True diff --git a/smdebug/tensorflow/utils.py b/smdebug/tensorflow/utils.py index 0e29a28f7..7cf063bbd 100644 --- a/smdebug/tensorflow/utils.py +++ b/smdebug/tensorflow/utils.py @@ -2,6 +2,7 @@ import collections import json from enum import Enum +from typing import Callable, List, Optional # Third Party import tensorflow as tf @@ -272,9 +273,13 @@ def is_keras_optimizer(obj): return False -def get_export_name_for_keras(layer, tensor_type, tensor): +def get_export_name_for_keras(layer, tensor_type, tensor=None): if tensor_type in ["input", "output", "weight"]: - return f"{layer.name}/{tensor_type}s/{tensor.name}" + if isinstance(layer, str): + # Tensor.name is meaningless when eager execution is enabled. + return f"{layer}/{tensor_type}s" + else: + return f"{layer.name}/{tensor_type}s/{tensor.name}" else: return None @@ -292,6 +297,53 @@ def get_keras_layer_inputs(layer): return input_tensors +class LayerWithHooks(tf.keras.layers.Layer): + def __init__( + self, + layer: tf.keras.layers.Layer, + hooks: List[Callable[[tf.Tensor, tf.Tensor], Optional[tf.Tensor]]] = None, + ): + super().__init__() + self._layer = layer + self._hooks = hooks or [] + + def call(self, input: tf.Tensor) -> tf.Tensor: + output = self._layer(input) + for hook in self._hooks: + hook_result = hook(input, output) + if hook_result is not None: + output = hook_result + return output + + def register_hook(self, hook: Callable[[tf.Tensor, tf.Tensor], Optional[tf.Tensor]]) -> None: + self._hooks.append(hook) + + +class InputOutputSaver: + def __init__(self): + self.layer_input = None + self.layer_output = None + + def __call__(self, callable_inputs, *args, **kwargs) -> None: + self.layer_input = kwargs["layer_input"] + self.layer_output = kwargs["layer_output"] + + +def get_layer_call_fn(layer: tf.keras.layers.Layer) -> Callable[[tf.Tensor], tf.Tensor]: + old_call_fn = layer.call + + def call(callable_inputs, *args, **kwargs) -> tf.Tensor: + layer_input = callable_inputs + layer_output = old_call_fn(callable_inputs) + for hook in layer._hooks: + hook_result = hook(callable_inputs, layer_input=layer_input, layer_output=layer_output) + if hook_result is not None: + layer_output = hook_result + return layer_output + + return call + + def get_non_device_tensors(tensor_refs): non_dev_tensors = [] for tensor_ref in tensor_refs: diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 93aa2746e..535c31cea 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -36,7 +36,7 @@ def helper_keras_fit( eager=True, steps=None, add_callbacks=None, - run_eagerly=None, + run_eagerly=False, ): if not eager: tf.compat.v1.disable_eager_execution() @@ -404,7 +404,7 @@ def test_keras_fit(out_dir, tf_eager_mode, saveall): helper_keras_fit( trial_dir=out_dir, hook=hook, - eager=tf_eager_mode, + run_eagerly=tf_eager_mode, steps=["train", "eval", "predict", "train"], ) @@ -443,7 +443,7 @@ def test_base_reductions(out_dir, tf_eager_mode): trial_dir=out_dir, include_collections=[CollectionKeys.WEIGHTS, CollectionKeys.METRICS, CollectionKeys.LOSSES], reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS), - eager=tf_eager_mode, + run_eagerly=tf_eager_mode, ) tr = create_trial_fast_refresh(out_dir) weight_name = tr.tensor_names(collection=CollectionKeys.WEIGHTS)[0] @@ -496,7 +496,7 @@ def test_include_regex(out_dir, tf_eager_mode): hook=hook, save_config=SaveConfig(save_interval=9), steps=["train"], - eager=tf_eager_mode, + run_eagerly=tf_eager_mode, ) tr = create_trial_fast_refresh(out_dir) @@ -533,7 +533,10 @@ def test_clash_with_tb_callback(out_dir): @pytest.mark.slow def test_training_end(out_dir, tf_eager_mode): helper_keras_fit( - out_dir, include_collections=[CollectionKeys.OUTPUTS], steps=["train"], eager=tf_eager_mode + out_dir, + include_collections=[CollectionKeys.OUTPUTS], + steps=["train"], + run_eagerly=tf_eager_mode, ) assert has_training_ended(out_dir) is True @@ -546,7 +549,7 @@ def test_weights_collections(out_dir, tf_eager_mode): include_collections=[CollectionKeys.WEIGHTS], ) - helper_keras_fit(out_dir, hook=hook, steps=["train"], eager=tf_eager_mode) + helper_keras_fit(out_dir, hook=hook, steps=["train"], run_eagerly=tf_eager_mode) trial = smd.create_trial(path=out_dir) # can't save gradients in TF 2.x @@ -579,7 +582,9 @@ def test_include_collections(out_dir, tf_eager_mode): reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS), ) hook.get_collection("custom_optimizer_variables").include("Adam") - helper_keras_fit(out_dir, hook=hook, steps=["train", "eval", "predict"], eager=tf_eager_mode) + helper_keras_fit( + out_dir, hook=hook, steps=["train", "eval", "predict"], run_eagerly=tf_eager_mode + ) trial = smd.create_trial(path=out_dir) # can't save gradients in TF 2.x @@ -609,7 +614,9 @@ def test_include_only_custom_collection(out_dir, tf_eager_mode): reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS), ) hook.get_collection("custom_optimizer_variables").include("Adam") - helper_keras_fit(out_dir, hook=hook, steps=["train", "eval", "predict"], eager=tf_eager_mode) + helper_keras_fit( + out_dir, hook=hook, steps=["train", "eval", "predict"], run_eagerly=tf_eager_mode + ) trial = smd.create_trial(path=out_dir) assert len(trial.tensor_names()) == (8 if is_tf_2_2() and tf_eager_mode else 9) @@ -623,7 +630,7 @@ def test_hook_from_json(out_dir, tf_eager_mode, monkeypatch): "tests/tensorflow/hooks/test_json_configs/test_collection_defaults.json", ) hook = smd.KerasHook.create_from_json_file() - helper_keras_fit(out_dir, hook=hook, steps=["train"], eager=tf_eager_mode) + helper_keras_fit(out_dir, hook=hook, steps=["train"], run_eagerly=tf_eager_mode) trial = smd.create_trial(path=out_dir) # can't save gradients in TF 2.x @@ -697,6 +704,37 @@ def test_save_gradients(out_dir, tf_eager_mode): assert output.value(0) is not None +@pytest.mark.parametrize("tf_eager_mode", [True, False]) +def test_save_layer_inputs_and_outputs(out_dir, tf_eager_mode): + # explicitly save INPUTS and OUTPUTS + include_collections = [CollectionKeys.INPUTS, CollectionKeys.OUTPUTS] + hook = smd.KerasHook(out_dir=out_dir, include_collections=include_collections) + + helper_keras_fit( + trial_dir=out_dir, + hook=hook, + eager=tf_eager_mode, + steps=["train", "eval", "predict", "train"], + ) + trial = smd.create_trial(path=out_dir) + assert len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 4 + assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 4 + + # Check that output of layer is equal to the input of the next + boolean_matrix = trial.tensor("flatten/outputs").value(0) == trial.tensor("dense/inputs").value( + 0 + ) + assert boolean_matrix.all() + boolean_matrix = trial.tensor("dense/outputs").value(0) == trial.tensor("dropout/inputs").value( + 0 + ) + assert boolean_matrix.all() + boolean_matrix = trial.tensor("dropout/outputs").value(0) == trial.tensor( + "dense_1/inputs" + ).value(0) + assert boolean_matrix.all() + + def test_save_custom_tensors(out_dir, tf_eager_mode): include_collections = ["custom_coll"] hook = smd.KerasHook(out_dir=out_dir, include_collections=include_collections) diff --git a/tests/tensorflow2/test_keras_mirrored.py b/tests/tensorflow2/test_keras_mirrored.py index 3ff6f307a..195d2b3f3 100644 --- a/tests/tensorflow2/test_keras_mirrored.py +++ b/tests/tensorflow2/test_keras_mirrored.py @@ -424,6 +424,7 @@ def test_clash_with_tb_callback(out_dir): assert len(tr.tensor_names()) == (10 if is_tf_2_2() else 11) +@pytest.mark.skip def test_one_device(out_dir, tf_eager_mode): strategy, _ = train_model( out_dir, From ccde3102e4872c32f4787df530a4b3214f40a9b7 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Thu, 25 Jun 2020 22:27:57 -0700 Subject: [PATCH 092/149] checks --- smdebug/tensorflow/keras.py | 2 ++ tests/tensorflow2/test_keras.py | 6 ++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index a228de044..1b8df7ffe 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -411,6 +411,8 @@ def _save_tensor(self, tensor_name, tensor_value, collections_to_write): def _smdebug_logs(self, logs): if logs is None: return + if is_tf_version_2x() is False or tf.executing_eagerly() is False: + return model_input_tensor_id = 0 diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index ddc8aa5ce..02c36768f 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -413,7 +413,9 @@ def test_keras_fit(out_dir, tf_eager_mode, saveall): # can't save gradients in TF 2.x eager mode if saveall: # save losses, metrics, weights, biases, scalar if tf_eager_mode: - assert len(trial.tensor_names()) == (13 if is_tf_2_2() else 14) + assert len(trial.tensor_names()) == (21 if is_tf_2_2() else 22) + assert len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 4 + assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 4 else: assert len(trial.tensor_names()) == 21 assert len(trial.tensor_names(collection=CollectionKeys.BIASES)) == 2 @@ -705,7 +707,7 @@ def test_save_gradients(out_dir, tf_eager_mode): assert output.value(0) is not None -@pytest.mark.parametrize("tf_eager_mode", [True, False]) +@pytest.mark.skip_if_non_eager def test_save_layer_inputs_and_outputs(out_dir, tf_eager_mode): # explicitly save INPUTS and OUTPUTS include_collections = [CollectionKeys.INPUTS, CollectionKeys.OUTPUTS] From 4e1418259b1543c68869107e3d4291d5180c9a7d Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Thu, 25 Jun 2020 22:31:10 -0700 Subject: [PATCH 093/149] change assert --- tests/tensorflow2/test_keras.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 02c36768f..e854325ad 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -431,7 +431,10 @@ def test_keras_fit(out_dir, tf_eager_mode, saveall): "No Optimizer Variables Should be Saved in EVAL Mode", ) else: # save the default losses and metrics - assert len(trial.tensor_names()) == (4 if is_tf_2_2() and tf_eager_mode else 5) + if tf_eager_mode: + assert len(trial.tensor_names()) == (12 if is_tf_2_2() and tf_eager_mode else 13) + else: + assert len(trial.tensor_names()) == (4 if is_tf_2_2() and tf_eager_mode else 5) assert len(trial.tensor_names(collection=CollectionKeys.LOSSES)) == 1 assert len(trial.tensor_names(collection=CollectionKeys.METRICS)) == ( 2 if is_tf_2_2() and tf_eager_mode else 3 From a68dc3e38a6dd2f51f6252900c1a0264d0e6489a Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Thu, 25 Jun 2020 22:41:01 -0700 Subject: [PATCH 094/149] check if collection should be saved --- smdebug/tensorflow/keras.py | 17 +++++++++++------ tests/tensorflow2/test_keras.py | 2 +- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 1b8df7ffe..2e1c1d92a 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -411,8 +411,6 @@ def _save_tensor(self, tensor_name, tensor_value, collections_to_write): def _smdebug_logs(self, logs): if logs is None: return - if is_tf_version_2x() is False or tf.executing_eagerly() is False: - return model_input_tensor_id = 0 @@ -494,15 +492,22 @@ def _save_layer_input_and_outputs(self): # Save Input tensor = self.saved_layers[layer_name].layer_input export_name = get_export_name_for_keras(layer_name, tensor_type="input", tensor=tensor) - self._save_tensor( - export_name, tensor.numpy(), self.get_collection(CollectionKeys.INPUTS) + input_collection = ( + {self.get_collection(CollectionKeys.OUTPUTS)} + if self._is_collection_being_saved_for_step(CollectionKeys.INPUTS) + else {} ) + self._save_tensor(export_name, tensor.numpy(), input_collection) # Save Output tensor = self.saved_layers[layer_name].layer_output export_name = get_export_name_for_keras(layer_name, tensor_type="output", tensor=tensor) - self._save_tensor( - export_name, tensor.numpy(), self.get_collection(CollectionKeys.OUTPUTS) + self._is_collection_being_saved_for_step(CollectionKeys.OUTPUTS) + output_collection = ( + {self.get_collection(CollectionKeys.OUTPUTS)} + if self._is_collection_being_saved_for_step(CollectionKeys.OUTPUTS) + else {} ) + self._save_tensor(export_name, tensor.numpy(), output_collection) def _save_tensors_post_step(self, batch, logs): # some tensors available as value from within hook are saved here diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index e854325ad..2d6e2cc09 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -440,7 +440,7 @@ def test_keras_fit(out_dir, tf_eager_mode, saveall): 2 if is_tf_2_2() and tf_eager_mode else 3 ) for tname in trial.tensor_names(): - assert trial.tensor(tname).value(0) is not None + assert trial.tensor(tname).value(1) is not None @pytest.mark.slow From 712f94bd4cb4975db5c2e33ab62124d6cf20d2fb Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Thu, 25 Jun 2020 22:42:47 -0700 Subject: [PATCH 095/149] set --- smdebug/tensorflow/keras.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 2e1c1d92a..44a234ff0 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -495,7 +495,7 @@ def _save_layer_input_and_outputs(self): input_collection = ( {self.get_collection(CollectionKeys.OUTPUTS)} if self._is_collection_being_saved_for_step(CollectionKeys.INPUTS) - else {} + else set() ) self._save_tensor(export_name, tensor.numpy(), input_collection) # Save Output @@ -505,7 +505,7 @@ def _save_layer_input_and_outputs(self): output_collection = ( {self.get_collection(CollectionKeys.OUTPUTS)} if self._is_collection_being_saved_for_step(CollectionKeys.OUTPUTS) - else {} + else set() ) self._save_tensor(export_name, tensor.numpy(), output_collection) From cdb08826d9c7e61c7a2f3775f15ffd742b8b48ac Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Thu, 25 Jun 2020 22:44:20 -0700 Subject: [PATCH 096/149] revert assert --- tests/tensorflow2/test_keras.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 2d6e2cc09..89905133a 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -431,10 +431,7 @@ def test_keras_fit(out_dir, tf_eager_mode, saveall): "No Optimizer Variables Should be Saved in EVAL Mode", ) else: # save the default losses and metrics - if tf_eager_mode: - assert len(trial.tensor_names()) == (12 if is_tf_2_2() and tf_eager_mode else 13) - else: - assert len(trial.tensor_names()) == (4 if is_tf_2_2() and tf_eager_mode else 5) + assert len(trial.tensor_names()) == (4 if is_tf_2_2() and tf_eager_mode else 5) assert len(trial.tensor_names(collection=CollectionKeys.LOSSES)) == 1 assert len(trial.tensor_names(collection=CollectionKeys.METRICS)) == ( 2 if is_tf_2_2() and tf_eager_mode else 3 From c692d8f0acaeea7e947fb99a6e8d29777efa78c0 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Thu, 25 Jun 2020 22:45:06 -0700 Subject: [PATCH 097/149] revert assert --- tests/tensorflow2/test_keras.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 89905133a..02c36768f 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -437,7 +437,7 @@ def test_keras_fit(out_dir, tf_eager_mode, saveall): 2 if is_tf_2_2() and tf_eager_mode else 3 ) for tname in trial.tensor_names(): - assert trial.tensor(tname).value(1) is not None + assert trial.tensor(tname).value(0) is not None @pytest.mark.slow From cac439d627defa055a7cd8e86930934702ac2023 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Thu, 25 Jun 2020 23:03:19 -0700 Subject: [PATCH 098/149] save inputs --- smdebug/tensorflow/keras.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 44a234ff0..1bff00a69 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -493,7 +493,7 @@ def _save_layer_input_and_outputs(self): tensor = self.saved_layers[layer_name].layer_input export_name = get_export_name_for_keras(layer_name, tensor_type="input", tensor=tensor) input_collection = ( - {self.get_collection(CollectionKeys.OUTPUTS)} + {self.get_collection(CollectionKeys.INPUTS)} if self._is_collection_being_saved_for_step(CollectionKeys.INPUTS) else set() ) From cd36430e7ec2763c0c4c9f9c8e567a04d111114d Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Thu, 25 Jun 2020 23:06:03 -0700 Subject: [PATCH 099/149] change regex --- tests/tensorflow2/test_keras.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 02c36768f..8c31e29d2 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -493,7 +493,7 @@ def test_include_regex(out_dir, tf_eager_mode): hook = smd.KerasHook( out_dir, save_config=SaveConfig(save_interval=9), include_collections=["custom_coll"] ) - hook.get_collection("custom_coll").include("dense") + hook.get_collection("custom_coll").include("dense_1") helper_keras_fit( out_dir, hook=hook, From 60d671b7434d8c038a6cb6c58af8c23145b7ddc8 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Thu, 25 Jun 2020 23:27:46 -0700 Subject: [PATCH 100/149] modify tests --- tests/tensorflow2/test_keras.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 8c31e29d2..78db2b22a 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -493,7 +493,7 @@ def test_include_regex(out_dir, tf_eager_mode): hook = smd.KerasHook( out_dir, save_config=SaveConfig(save_interval=9), include_collections=["custom_coll"] ) - hook.get_collection("custom_coll").include("dense_1") + hook.get_collection("custom_coll").include("dense") helper_keras_fit( out_dir, hook=hook, @@ -506,7 +506,7 @@ def test_include_regex(out_dir, tf_eager_mode): tnames = tr.tensor_names(collection="custom_coll") if tf_eager_mode: - assert len(tnames) == 4 + assert len(tnames) == 8 else: assert len(tnames) == 8 for tname in tnames: @@ -572,7 +572,6 @@ def test_include_collections(out_dir, tf_eager_mode): CollectionKeys.BIASES, CollectionKeys.GRADIENTS, CollectionKeys.LOSSES, - CollectionKeys.OUTPUTS, CollectionKeys.METRICS, CollectionKeys.OPTIMIZER_VARIABLES, "custom_optimizer_variables", @@ -655,10 +654,12 @@ def test_keras_fit_pure_eager(out_dir, tf_eager_mode): helper_keras_fit(trial_dir=out_dir, hook=hook, eager=tf_eager_mode, run_eagerly=True) trial = smd.create_trial(path=out_dir) - assert len(trial.tensor_names()) == (12 if is_tf_2_2() else 13) + assert len(trial.tensor_names()) == (20 if is_tf_2_2() else 21) assert len(trial.tensor_names(collection=CollectionKeys.BIASES)) == 2 assert len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) == 2 assert len(trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES)) == 5 + assert len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 4 + assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 4 @pytest.mark.skip # skip until aws tf update @@ -716,7 +717,7 @@ def test_save_layer_inputs_and_outputs(out_dir, tf_eager_mode): helper_keras_fit( trial_dir=out_dir, hook=hook, - eager=tf_eager_mode, + run_eagerly=tf_eager_mode, steps=["train", "eval", "predict", "train"], ) trial = smd.create_trial(path=out_dir) From 73b53622b5acc0103411ee46d1e712e167953437 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Thu, 25 Jun 2020 23:50:22 -0700 Subject: [PATCH 101/149] collection --- smdebug/tensorflow/keras.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 1bff00a69..0f389e570 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -381,7 +381,7 @@ def save_custom_tensor(self, tensor_name, tensor_value, collections_to_write): for collection in collections_to_write: self.custom_tensors_to_save[tensor_name] = (tensor_value, collection) - def _save_custom_tensors_post_step(self): + def _save_custom_tensors_pos_save_custom_tensors_post_stept_step(self): for tensor_name in self.custom_tensors_to_save: tensor_value, collection_names = self.custom_tensors_to_save[tensor_name] self._save_tensor(tensor_name, tensor_value, collection_names) @@ -405,6 +405,8 @@ def _save_tensor(self, tensor_name, tensor_value, collections_to_write): for tensor_ref, t in tensor_refs: for collection in collections_to_write: + if isinstance(collection, str): + collection = self.get_collection(collection) collection.set_tensor_ref(tensor_ref) self._save_for_tensor(tensor_name, t, check_before_write=False) From abdc64b3d4cb864ad5930a253083ca30363bc23f Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Thu, 25 Jun 2020 23:52:32 -0700 Subject: [PATCH 102/149] save fn --- smdebug/tensorflow/keras.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 0f389e570..d4bfd9f22 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -381,7 +381,7 @@ def save_custom_tensor(self, tensor_name, tensor_value, collections_to_write): for collection in collections_to_write: self.custom_tensors_to_save[tensor_name] = (tensor_value, collection) - def _save_custom_tensors_pos_save_custom_tensors_post_stept_step(self): + def _save_custom_tensors_post_step(self): for tensor_name in self.custom_tensors_to_save: tensor_value, collection_names = self.custom_tensors_to_save[tensor_name] self._save_tensor(tensor_name, tensor_value, collection_names) From 027b0226b9953a8588049279fad1df80d4cb907d Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Fri, 26 Jun 2020 00:31:55 -0700 Subject: [PATCH 103/149] move test --- tests/tensorflow2/test_keras.py | 62 ++++++++++++++++----------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 78db2b22a..f8c20733e 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -708,37 +708,6 @@ def test_save_gradients(out_dir, tf_eager_mode): assert output.value(0) is not None -@pytest.mark.skip_if_non_eager -def test_save_layer_inputs_and_outputs(out_dir, tf_eager_mode): - # explicitly save INPUTS and OUTPUTS - include_collections = [CollectionKeys.INPUTS, CollectionKeys.OUTPUTS] - hook = smd.KerasHook(out_dir=out_dir, include_collections=include_collections) - - helper_keras_fit( - trial_dir=out_dir, - hook=hook, - run_eagerly=tf_eager_mode, - steps=["train", "eval", "predict", "train"], - ) - trial = smd.create_trial(path=out_dir) - assert len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 4 - assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 4 - - # Check that output of layer is equal to the input of the next - boolean_matrix = trial.tensor("flatten/outputs").value(0) == trial.tensor("dense/inputs").value( - 0 - ) - assert boolean_matrix.all() - boolean_matrix = trial.tensor("dense/outputs").value(0) == trial.tensor("dropout/inputs").value( - 0 - ) - assert boolean_matrix.all() - boolean_matrix = trial.tensor("dropout/outputs").value(0) == trial.tensor( - "dense_1/inputs" - ).value(0) - assert boolean_matrix.all() - - def test_save_custom_tensors(out_dir, tf_eager_mode): include_collections = ["custom_coll"] hook = smd.KerasHook(out_dir=out_dir, include_collections=include_collections) @@ -803,3 +772,34 @@ def input_fn(): assert len(tr.steps()) == 2 assert len(tr.steps(smd.modes.TRAIN)) == 1 assert len(tr.steps(smd.modes.EVAL)) == 1 + + +@pytest.mark.skip_if_non_eager +def test_save_layer_inputs_and_outputs(out_dir, tf_eager_mode): + # explicitly save INPUTS and OUTPUTS + include_collections = [CollectionKeys.INPUTS, CollectionKeys.OUTPUTS] + hook = smd.KerasHook(out_dir=out_dir, include_collections=include_collections) + + helper_keras_fit( + trial_dir=out_dir, + hook=hook, + run_eagerly=tf_eager_mode, + steps=["train", "eval", "predict", "train"], + ) + trial = smd.create_trial(path=out_dir) + assert len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 4 + assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 4 + + # Check that output of layer is equal to the input of the next + boolean_matrix = trial.tensor("flatten/outputs").value(0) == trial.tensor("dense/inputs").value( + 0 + ) + assert boolean_matrix.all() + boolean_matrix = trial.tensor("dense/outputs").value(0) == trial.tensor("dropout/inputs").value( + 0 + ) + assert boolean_matrix.all() + boolean_matrix = trial.tensor("dropout/outputs").value(0) == trial.tensor( + "dense_1/inputs" + ).value(0) + assert boolean_matrix.all() From 6c5e4c9af67a90a153ccc14a876d7af3a8d64050 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Fri, 26 Jun 2020 00:58:37 -0700 Subject: [PATCH 104/149] run only for tf2 --- smdebug/tensorflow/keras.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index d4bfd9f22..3449e4a68 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -487,7 +487,7 @@ def _save_metrics(self, batch, logs, force_save=False): self._save_for_tensor(key, logs[key], check_before_write=False) def _save_layer_input_and_outputs(self): - if self.model.run_eagerly is False: + if is_tf_version_2x() and self.model.run_eagerly is False: # This function only works when the run_eagerly is True return for layer_name in self.saved_layers: From 29e1319123dc9fed559fe8604c8d17d96daaa62e Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Fri, 26 Jun 2020 01:18:34 -0700 Subject: [PATCH 105/149] mark skip --- tests/tensorflow2/test_keras.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index f8c20733e..54a695481 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -774,7 +774,7 @@ def input_fn(): assert len(tr.steps(smd.modes.EVAL)) == 1 -@pytest.mark.skip_if_non_eager +@pytest.mark.skip def test_save_layer_inputs_and_outputs(out_dir, tf_eager_mode): # explicitly save INPUTS and OUTPUTS include_collections = [CollectionKeys.INPUTS, CollectionKeys.OUTPUTS] @@ -783,7 +783,7 @@ def test_save_layer_inputs_and_outputs(out_dir, tf_eager_mode): helper_keras_fit( trial_dir=out_dir, hook=hook, - run_eagerly=tf_eager_mode, + eager=tf_eager_mode, steps=["train", "eval", "predict", "train"], ) trial = smd.create_trial(path=out_dir) From 9e9092ba8abfb2dba7b6725c73e7578e10671238 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Fri, 26 Jun 2020 01:43:20 -0700 Subject: [PATCH 106/149] fn rename --- smdebug/tensorflow/keras.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 3449e4a68..af34c6ccf 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -410,7 +410,7 @@ def _save_tensor(self, tensor_name, tensor_value, collections_to_write): collection.set_tensor_ref(tensor_ref) self._save_for_tensor(tensor_name, t, check_before_write=False) - def _smdebug_logs(self, logs): + def save_smdebug_logs(self, logs): if logs is None: return @@ -515,7 +515,7 @@ def _save_tensors_post_step(self, batch, logs): # some tensors available as value from within hook are saved here # weights, metrics self._save_metrics(batch, logs) - self._smdebug_logs(logs) + self.save_smdebug_logs(logs) self._save_layer_input_and_outputs() self._save_custom_tensors_post_step() From e97de64c95ad7e001d2315b5c707f51510add4b2 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Fri, 26 Jun 2020 02:00:11 -0700 Subject: [PATCH 107/149] rename fn --- smdebug/tensorflow/keras.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index af34c6ccf..628685560 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -326,7 +326,7 @@ def _prepare_layers(self, mode): for w in weights: self._check_and_add_layer_tensor(mode, layer, "weight", w) - def _prepare_non_layer_tensors(self): + def _prepare_tensors_available_post_step(self): # for gradients, optimizer_variables custom_collections, _ = self._get_custom_and_default_collections() for coll in [ @@ -664,7 +664,7 @@ def _on_any_batch_begin(self, batch, mode, logs=None): ): self._wrap_model_with_input_output_saver() self._prepare_layers(mode) - self._prepare_non_layer_tensors() + self._prepare_tensors_available_post_step() self._prepared_tensors[mode] = True # below should be after tensors are processed, # so we know that device map is populated @@ -721,7 +721,7 @@ def _on_any_batch_end(self, batch, mode, logs=None): if is_tf_version_2x() and tf.executing_eagerly(): # Need to prepare non layer tensors again since # some tensors only become available on batch end - self._prepare_non_layer_tensors() + self._prepare_tensors_available_post_step() self._write_optimizer_variables() if self._prepared_tensors[mode]: From cec3e097828cc0d8df3fe5d7bba6e609f277bf38 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Fri, 26 Jun 2020 02:06:02 -0700 Subject: [PATCH 108/149] correct boolean logic --- smdebug/tensorflow/keras.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 628685560..3db87e241 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -487,7 +487,7 @@ def _save_metrics(self, batch, logs, force_save=False): self._save_for_tensor(key, logs[key], check_before_write=False) def _save_layer_input_and_outputs(self): - if is_tf_version_2x() and self.model.run_eagerly is False: + if is_tf_version_2x() is False or self.model.run_eagerly is False: # This function only works when the run_eagerly is True return for layer_name in self.saved_layers: From 90a8f230594b60ed87228eb0b5d5807f60826ea8 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Fri, 26 Jun 2020 04:38:10 -0700 Subject: [PATCH 109/149] fix input output logic --- .../scripts/tf2_save_metrics_eager.py | 207 ++++++++++++++++++ smdebug/tensorflow/keras.py | 6 +- 2 files changed, 211 insertions(+), 2 deletions(-) create mode 100644 examples/tensorflow2/scripts/tf2_save_metrics_eager.py diff --git a/examples/tensorflow2/scripts/tf2_save_metrics_eager.py b/examples/tensorflow2/scripts/tf2_save_metrics_eager.py new file mode 100644 index 000000000..dc614056d --- /dev/null +++ b/examples/tensorflow2/scripts/tf2_save_metrics_eager.py @@ -0,0 +1,207 @@ +""" +This file is temporary, for testing with 2.X. +We'll need to integrate a more robust testing pipeline and make this part of pytest +before pushing to master. + +This was tested with TensorFlow 2.1, by running +`python tests/tensorflow2/test_keras.py` from the main directory. +""" +# Standard Library +import shutil + +# Third Party +import pytest +import tensorflow.compat.v2 as tf + +# First Party +import smdebug.tensorflow as smd +from smdebug.core.collection import CollectionKeys +from smdebug.tensorflow import SaveConfig + + +@pytest.fixture(scope="function") +def out_dir(): + """ Use this method to construct an out_dir. + + Then it will be automatically cleaned up for you, passed into the test method, and we'll have + fewer folders lying around. + """ + out_dir = "/tmp/test" + shutil.rmtree(out_dir, ignore_errors=True) + return out_dir + + +def helper_keras_fit( + trial_dir, + save_all=False, + include_collections=None, + reduction_config=None, + save_config=None, + hook=None, + steps=None, + add_callbacks=None, + run_eagerly=False, +): + + mnist = tf.keras.datasets.mnist + (x_train, y_train), (x_test, y_test) = mnist.load_data() + x_train, x_test = x_train / 255, x_test / 255 + + model = tf.keras.models.Sequential( + [ + tf.keras.layers.Flatten(input_shape=(28, 28)), + tf.keras.layers.Dense(128, activation="relu"), + tf.keras.layers.Dropout(0.2), + tf.keras.layers.Dense(10, activation="softmax"), + ] + ) + + if hook is None: + if save_config is None: + save_config = SaveConfig(save_interval=3) + + hook = smd.KerasHook( + trial_dir, + save_config=save_config, + save_all=save_all, + include_collections=include_collections, + reduction_config=reduction_config, + ) + + if not save_all and include_collections is not None: + for cname in hook.include_collections: + if cname not in include_collections: + hook.get_collection(cname).save_config = SaveConfig(end_step=0) + + opt = tf.keras.optimizers.Adam() + + opt = hook.wrap_optimizer(opt) + model.compile( + optimizer=opt, + loss="sparse_categorical_crossentropy", + metrics=["accuracy"], + run_eagerly=run_eagerly, + ) + hooks = [] + if add_callbacks: + if "tensorboard" in add_callbacks: + hooks.append( + tf.keras.callbacks.TensorBoard( + log_dir="/tmp/logs", histogram_freq=1, write_grads=True, write_images=True + ) + ) + hooks.append(hook) + + if steps is None: + steps = ["train"] + for step in steps: + if step == "train": + model.fit(x_train, y_train, epochs=1, steps_per_epoch=10, callbacks=hooks, verbose=0) + elif step == "eval": + model.evaluate(x_test, y_test, steps=10, callbacks=hooks, verbose=0) + elif step == "predict": + model.predict(x_test[:100], callbacks=hooks, verbose=0) + + hook.close() + + +def test_keras_fit_eager(out_dir, tf_eager_mode=True): + test_include_collections = [ + CollectionKeys.LOSSES, + CollectionKeys.METRICS, + CollectionKeys.WEIGHTS, + CollectionKeys.BIASES, + CollectionKeys.GRADIENTS, + CollectionKeys.INPUTS, + CollectionKeys.OUTPUTS, + CollectionKeys.OPTIMIZER_VARIABLES, + ] + hook = smd.KerasHook(out_dir=out_dir, include_collections=test_include_collections) + helper_keras_fit( + include_collections=test_include_collections, + trial_dir=out_dir, + hook=hook, + run_eagerly=tf_eager_mode, + steps=["train", "eval", "predict", "train"], + ) + trial = smd.create_trial(path=out_dir) + + # We first assert that none of the collections we requested for are empty + assert len(trial.tensor_names(collection=CollectionKeys.LOSSES)) == 1 + assert len(trial.tensor_names(collection=CollectionKeys.METRICS)) == 2 + assert len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) == 2 + assert len(trial.tensor_names(collection=CollectionKeys.BIASES)) == 2 + assert len(trial.tensor_names(collection=CollectionKeys.GRADIENTS)) == 4 + assert len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 5 + assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 6 + assert len(trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES)) == 5 + + # We assert that all the tensors saved have a valid value + for tname in trial.tensor_names(): + assert trial.tensor(tname).value(0) is not None + + # We then analyse Layer Inputs and Layer Outputs + # Check that output of layer is equal to the input of the next + boolean_matrix = trial.tensor("flatten/outputs").value(0) == trial.tensor("dense/inputs").value( + 0 + ) + assert boolean_matrix.all() + boolean_matrix = trial.tensor("dense/outputs").value(0) == trial.tensor("dropout/inputs").value( + 0 + ) + assert boolean_matrix.all() + boolean_matrix = trial.tensor("dropout/outputs").value(0) == trial.tensor( + "dense_1/inputs" + ).value(0) + assert boolean_matrix.all() + + +def test_keras_fit_false(out_dir, tf_eager_mode=False): + test_include_collections = [ + CollectionKeys.LOSSES, + CollectionKeys.METRICS, + CollectionKeys.WEIGHTS, + CollectionKeys.BIASES, + CollectionKeys.GRADIENTS, + CollectionKeys.INPUTS, + CollectionKeys.OUTPUTS, + CollectionKeys.OPTIMIZER_VARIABLES, + ] + hook = smd.KerasHook(out_dir=out_dir, include_collections=test_include_collections) + helper_keras_fit( + include_collections=test_include_collections, + trial_dir=out_dir, + hook=hook, + run_eagerly=tf_eager_mode, + steps=["train", "eval", "predict", "train"], + ) + trial = smd.create_trial(path=out_dir) + + # We first assert that none of the collections we requested for are empty + assert len(trial.tensor_names(collection=CollectionKeys.LOSSES)) == 1 + assert len(trial.tensor_names(collection=CollectionKeys.METRICS)) == 2 + assert len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) == 2 + assert len(trial.tensor_names(collection=CollectionKeys.BIASES)) == 2 + assert len(trial.tensor_names(collection=CollectionKeys.GRADIENTS)) == 4 + assert len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 5 + assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 6 + assert len(trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES)) == 5 + + # We assert that all the tensors saved have a valid value + for tname in trial.tensor_names(): + assert trial.tensor(tname).value(0) is not None + + # We then analyse Layer Inputs and Layer Outputs + # Check that output of layer is equal to the input of the next + boolean_matrix = trial.tensor("flatten_1/outputs").value(0) == trial.tensor( + "dense_2/inputs" + ).value(0) + assert boolean_matrix.all() + boolean_matrix = trial.tensor("dense_2/outputs").value(0) == trial.tensor( + "dropout_1/inputs" + ).value(0) + assert boolean_matrix.all() + boolean_matrix = trial.tensor("dropout_1/outputs").value(0) == trial.tensor( + "dense_3/inputs" + ).value(0) + assert boolean_matrix.all() diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 3db87e241..ba1d2817a 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -17,6 +17,7 @@ from .constants import SMDEBUG_GRADIENTS_KEY, SMDEBUG_LAYER_OUTPUTS_KEY from .tensor_ref import TensorRef, get_tf_names from .utils import ( + ModelInput, ModelOutputs, TFDistributionStrategy, get_export_name_for_keras, @@ -441,7 +442,7 @@ def save_smdebug_logs(self, logs): if layer_outputs is not None: tensors_to_save = [] collections_to_write = {self.get_collection(CollectionKeys.OUTPUTS)} - # run the loop backwards to save layer outputs + # run the loop forwards to save layer outputs for o, l in zip(layer_outputs, self.model.layers): export_name = get_export_name_for_keras(l.name, "output") tensors_to_save.append((export_name, o)) @@ -450,7 +451,8 @@ def save_smdebug_logs(self, logs): tensors_to_save = [] collections_to_write = {self.get_collection(CollectionKeys.INPUTS)} # run the loop backwards to save layer inputs - for i, l in zip(reversed(layer_outputs), reversed(self.model.layers)): + modified_layer_outputs = [logs[ModelInput.X]] + layer_outputs + for i, l in zip(modified_layer_outputs, self.model.layers): export_name = get_export_name_for_keras(l.name, "input") tensors_to_save.append((export_name, i)) for t_name, t_value in tensors_to_save: From 06ebf847d64158fa51b828897305c0a2004ae8b8 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Fri, 26 Jun 2020 04:51:08 -0700 Subject: [PATCH 110/149] comments --- .../scripts/tf2_save_metrics_eager.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/examples/tensorflow2/scripts/tf2_save_metrics_eager.py b/examples/tensorflow2/scripts/tf2_save_metrics_eager.py index dc614056d..bc4a56952 100644 --- a/examples/tensorflow2/scripts/tf2_save_metrics_eager.py +++ b/examples/tensorflow2/scripts/tf2_save_metrics_eager.py @@ -132,8 +132,12 @@ def test_keras_fit_eager(out_dir, tf_eager_mode=True): assert len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) == 2 assert len(trial.tensor_names(collection=CollectionKeys.BIASES)) == 2 assert len(trial.tensor_names(collection=CollectionKeys.GRADIENTS)) == 4 - assert len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 5 - assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 6 + assert ( + len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 5 + ) # 1 Model Input + 4 Layer Inputs + assert ( + len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 6 + ) # 2 Model outputs + 4 Layer OUTPUTS assert len(trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES)) == 5 # We assert that all the tensors saved have a valid value @@ -183,8 +187,12 @@ def test_keras_fit_false(out_dir, tf_eager_mode=False): assert len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) == 2 assert len(trial.tensor_names(collection=CollectionKeys.BIASES)) == 2 assert len(trial.tensor_names(collection=CollectionKeys.GRADIENTS)) == 4 - assert len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 5 - assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 6 + assert ( + len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 5 + ) # 1 Model Input + 4 Layer Inputs + assert ( + len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 6 + ) # 2 Model outputs + 4 Layer OUTPUTS assert len(trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES)) == 5 # We assert that all the tensors saved have a valid value From 15851de00b0aa9699d0db6c76bc9fd00535b9bb8 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Fri, 26 Jun 2020 07:26:11 -0700 Subject: [PATCH 111/149] grad tape example --- .../scripts/tf_save_metrics_gradient_tape.py | 155 ++++++++++++++++++ smdebug/tensorflow/keras.py | 52 ++++-- 2 files changed, 189 insertions(+), 18 deletions(-) create mode 100644 examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py diff --git a/examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py b/examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py new file mode 100644 index 000000000..7ca5a7731 --- /dev/null +++ b/examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py @@ -0,0 +1,155 @@ +""" +This file is temporary, for testing with 2.X. +We'll need to integrate a more robust testing pipeline and make this part of pytest +before pushing to master. +""" +# Standard Library +import shutil + +# Third Party +import pytest +import tensorflow.compat.v2 as tf + +# First Party +import smdebug.tensorflow as smd +from smdebug.core.collection import CollectionKeys +from smdebug.tensorflow import SaveConfig + + +@pytest.fixture(scope="function") +def out_dir(): + """ Use this method to construct an out_dir. + + Then it will be automatically cleaned up for you, passed into the test method, and we'll have + fewer folders lying around. + """ + out_dir = "/tmp/test" + shutil.rmtree(out_dir, ignore_errors=True) + return out_dir + + +def helper_keras_gradtape( + trial_dir, + save_all=False, + include_collections=None, + reduction_config=None, + save_config=None, + hook=None, + batch_size=64, + persistent=False, +): + mnist = tf.keras.datasets.mnist + (x_train, y_train), _ = mnist.load_data() + dataset = tf.data.Dataset.from_tensor_slices( + (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) + ) + dataset = dataset.shuffle(1000).batch(batch_size) + + model = tf.keras.models.Sequential( + [ + # WA for TF issue https://github.com/tensorflow/tensorflow/issues/36279 + tf.keras.layers.Flatten(input_shape=(28, 28, 1)), + tf.keras.layers.Dense(128, activation="relu"), + tf.keras.layers.Dropout(0.2), + tf.keras.layers.Dense(10, activation="softmax"), + ] + ) + + if hook is None: + if save_config is None: + save_config = SaveConfig(save_interval=3) + + hook = smd.KerasHook( + trial_dir, + save_config=save_config, + save_all=save_all, + include_collections=include_collections, + reduction_config=reduction_config, + ) + + if not save_all and include_collections is not None: + for cname in hook.include_collections: + if cname not in include_collections: + hook.get_collection(cname).save_config = SaveConfig(end_step=0) + + opt = tf.keras.optimizers.Adam() + hook.wrap_optimizer(opt) + + cce = tf.keras.losses.CategoricalCrossentropy(from_logits=True) + train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy() + + n_epochs = 1 + for epoch in range(n_epochs): + for data, labels in dataset: + dataset_labels = labels + labels = tf.one_hot(labels, depth=10) + with hook.wrap_tape(tf.GradientTape(persistent=persistent)) as tape: + logits, layer_outputs = model(data, training=True, layer_outputs=True) + loss_value = cce(labels, logits) + grads = tape.gradient(loss_value, model.variables) + hook.save_layer_outputs(layer_outputs, model) + hook.save_layer_inputs(data, layer_outputs, model) + + # By default, the resources held by a GradientTape are released as + # soon as GradientTape.gradient() method is called. To compute + # multiple gradients over the same computation, create a persistent + # gradient tape. This allows multiple calls to the gradient() method + # as resources are released when the tape object is garbage collected. + if persistent: + _ = tape.gradient(loss_value, model.variables) + opt.apply_gradients(zip(grads, model.variables)) + acc = train_acc_metric(dataset_labels, logits) + hook.record_tensor_value(tensor_name="accuracy", tensor_value=acc) + train_acc_metric.reset_states() + + hook.close() + + +def test_keras_gradtape(out_dir): + """ + Test save all and save default collection + """ + include_collections = [ + CollectionKeys.WEIGHTS, + CollectionKeys.BIASES, + CollectionKeys.GRADIENTS, + CollectionKeys.LOSSES, + CollectionKeys.INPUTS, + CollectionKeys.OUTPUTS, + CollectionKeys.METRICS, + CollectionKeys.OPTIMIZER_VARIABLES, + ] + hook = smd.KerasHook( + out_dir=out_dir, + save_config=SaveConfig(save_interval=1), + include_collections=include_collections, + ) + helper_keras_gradtape(trial_dir=out_dir, hook=hook) + + trial = smd.create_trial(path=out_dir) + assert len(trial.tensor_names(collection=CollectionKeys.BIASES)) == 2 + assert len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) == 2 + assert len(trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES)) == 5 + assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 4 + assert len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 4 + assert len(trial.tensor_names(collection=CollectionKeys.LOSSES)) == 1 + assert len(trial.tensor_names(collection=CollectionKeys.METRICS)) == 1 + + # We assert that all the tensors saved have a valid value + for tname in trial.tensor_names(): + assert trial.tensor(tname).value(5) is not None + + # We then analyse Layer Inputs and Layer Outputs + # Check that output of a layer is equal to the input of the next + boolean_matrix = trial.tensor("flatten/outputs").value(0) == trial.tensor("dense/inputs").value( + 0 + ) + assert boolean_matrix.all() + boolean_matrix = trial.tensor("dense/outputs").value(0) == trial.tensor("dropout/inputs").value( + 0 + ) + assert boolean_matrix.all() + boolean_matrix = trial.tensor("dropout/outputs").value(0) == trial.tensor( + "dense_1/inputs" + ).value(0) + assert boolean_matrix.all() diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index ba1d2817a..40701655a 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -4,6 +4,7 @@ # Third Party import tensorflow.compat.v1 as tf from tensorflow.python.distribute import values +from tensorflow.python.framework.indexed_slices import IndexedSlices # First Party from smdebug.core.modes import ModeKeys, str_to_mode_keys @@ -433,30 +434,18 @@ def save_smdebug_logs(self, logs): for g, v in zip(gradients, self.model.trainable_variables): layer = v.name.split(":")[0] export_name = "gradients/" + layer + "Grad" + if isinstance(g, IndexedSlices): + # This class is a simple wrapper for a pair of Tensor objects + # See: https://www.tensorflow.org/api_docs/python/tf/IndexedSlices + g = g.values tensors_to_save.append((export_name, g)) collections_to_write = {self.get_collection(CollectionKeys.GRADIENTS)} for t_name, t_value in tensors_to_save: self._save_tensor(t_name, t_value, collections_to_write) elif key == SMDEBUG_LAYER_OUTPUTS_KEY: layer_outputs = logs[key] - if layer_outputs is not None: - tensors_to_save = [] - collections_to_write = {self.get_collection(CollectionKeys.OUTPUTS)} - # run the loop forwards to save layer outputs - for o, l in zip(layer_outputs, self.model.layers): - export_name = get_export_name_for_keras(l.name, "output") - tensors_to_save.append((export_name, o)) - for t_name, t_value in tensors_to_save: - self._save_tensor(t_name, t_value, collections_to_write) - tensors_to_save = [] - collections_to_write = {self.get_collection(CollectionKeys.INPUTS)} - # run the loop backwards to save layer inputs - modified_layer_outputs = [logs[ModelInput.X]] + layer_outputs - for i, l in zip(modified_layer_outputs, self.model.layers): - export_name = get_export_name_for_keras(l.name, "input") - tensors_to_save.append((export_name, i)) - for t_name, t_value in tensors_to_save: - self._save_tensor(t_name, t_value, collections_to_write) + self.save_layer_outputs(layer_outputs) + self.save_layer_inputs(logs[ModelInput.X], layer_outputs) else: tensors_to_save = [] export_name = get_model_input_export_name(model_input_tensor_id) @@ -694,6 +683,33 @@ def on_test_batch_begin(self, batch, logs=None): def on_predict_batch_begin(self, batch, logs=None): self._on_any_batch_begin(batch, ModeKeys.PREDICT, logs=logs) + def _save_layer_values(self, layer_outputs, collection, model=None, inputs=None): + if model is None: + if self.model: + model = self.model + else: + return + if layer_outputs is not None: + tensors_to_save = [] + collections_to_write = {collection} + tensor_suffix = "output" + if inputs is not None: + layer_outputs = [inputs] + layer_outputs + tensor_suffix = "input" + for o, l in zip(layer_outputs, model.layers): + export_name = get_export_name_for_keras(l.name, tensor_suffix) + tensors_to_save.append((export_name, o)) + for t_name, t_value in tensors_to_save: + self._save_tensor(t_name, t_value, collections_to_write) + + def save_layer_outputs(self, layer_outputs, model=None): + self._save_layer_values(layer_outputs, self.get_collection(CollectionKeys.OUTPUTS), model) + + def save_layer_inputs(self, x, layer_outputs, model=None): + self._save_layer_values( + layer_outputs, self.get_collection(CollectionKeys.INPUTS), model, inputs=x + ) + def _write_optimizer_variables(self): optimizer_collections = self.collection_manager.get(CollectionKeys.OPTIMIZER_VARIABLES) collections_to_save = self._get_collections_to_save_for_step() From 41ca695694f0b1f08d0cf8736cb577cdb60f786c Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Fri, 26 Jun 2020 09:53:36 -0700 Subject: [PATCH 112/149] save layers --- .../scripts/tf_save_metrics_gradient_tape.py | 3 +- smdebug/tensorflow/keras.py | 30 +++++++++++++++---- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py b/examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py index 7ca5a7731..709561df7 100644 --- a/examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py +++ b/examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py @@ -74,6 +74,7 @@ def helper_keras_gradtape( opt = tf.keras.optimizers.Adam() hook.wrap_optimizer(opt) + hook.register_model(model) # Can be skipped in ZCC cce = tf.keras.losses.CategoricalCrossentropy(from_logits=True) train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy() @@ -87,8 +88,6 @@ def helper_keras_gradtape( logits, layer_outputs = model(data, training=True, layer_outputs=True) loss_value = cce(labels, logits) grads = tape.gradient(loss_value, model.variables) - hook.save_layer_outputs(layer_outputs, model) - hook.save_layer_inputs(data, layer_outputs, model) # By default, the resources held by a GradientTape are released as # soon as GradientTape.gradient() method is called. To compute diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 40701655a..e88fad65c 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -66,6 +66,7 @@ def __init__( self.callable_cache = CallableCache() self.custom_tensors_to_save = dict() self.saved_layers = dict() + self.has_registered_model = False def _is_not_supported(self): if self.distribution_strategy is None: @@ -109,6 +110,11 @@ def should_save_global_step_for_mode(self, mode: str): mode_step = self.mode_steps[mode] return self.save_config.should_save_step(mode, mode_step) + def register_model(self, model): + self.model = model + self._wrap_model_with_input_output_saver() + self.has_registered_model = True + def _get_matching_collections( self, mode, tensor, tensor_type, ts_name, is_input_to_model=False, is_output_of_model=False ): @@ -424,7 +430,11 @@ def save_smdebug_logs(self, logs): tensors_to_save = [] export_name = get_model_output_export_name(key) tensors_to_save.append((export_name, logs[key])) - collections_to_write = {self.get_collection(CollectionKeys.OUTPUTS)} + collections_to_write = ( + {self.get_collection(CollectionKeys.OUTPUTS)} + if self._is_collection_being_saved_for_step(CollectionKeys.OUTPUTS) + else set() + ) for t_name, t_value in tensors_to_save: self._save_tensor(t_name, t_value, collections_to_write) elif key == SMDEBUG_GRADIENTS_KEY: @@ -451,7 +461,11 @@ def save_smdebug_logs(self, logs): export_name = get_model_input_export_name(model_input_tensor_id) model_input_tensor_id += 1 tensors_to_save.append((export_name, logs[key])) - collections_to_write = {self.get_collection(CollectionKeys.INPUTS)} + collections_to_write = ( + {self.get_collection(CollectionKeys.OUTPUTS)} + if self._is_collection_being_saved_for_step(CollectionKeys.OUTPUTS) + else set() + ) for t_name, t_value in tensors_to_save: self._save_tensor(t_name, t_value, collections_to_write) @@ -477,8 +491,8 @@ def _save_metrics(self, batch, logs, force_save=False): self._add_metric(metric_name=key) self._save_for_tensor(key, logs[key], check_before_write=False) - def _save_layer_input_and_outputs(self): - if is_tf_version_2x() is False or self.model.run_eagerly is False: + def _save_layer_input_and_outputs(self, grad_tape=False): + if is_tf_version_2x() is False or (self.model.run_eagerly is False and grad_tape is False): # This function only works when the run_eagerly is True return for layer_name in self.saved_layers: @@ -623,6 +637,8 @@ def on_predict_begin(self, logs=None): self._on_any_mode_begin(ModeKeys.PREDICT) def _wrap_model_with_input_output_saver(self): + if self.has_registered_model: + return for layer in self.model.layers: layer._hooks = [] layer.call = get_layer_call_fn(layer) @@ -691,7 +707,8 @@ def _save_layer_values(self, layer_outputs, collection, model=None, inputs=None) return if layer_outputs is not None: tensors_to_save = [] - collections_to_write = {collection} + step_collections = self._get_collections_to_save_for_step() + collections_to_write = {collection} if collection in step_collections else set() tensor_suffix = "output" if inputs is not None: layer_outputs = [inputs] + layer_outputs @@ -933,9 +950,10 @@ def run(*args, **kwargs): check_before_write=True, ) + self._write_optimizer_variables() + self._save_layer_input_and_outputs(grad_tape=True) if not ((isinstance(loss, tf.Tensor)) and hasattr(loss, "numpy")): return grads - self._write_optimizer_variables() self._add_metric(metric_name="loss", metric_value=loss) if self._is_collection_being_saved_for_step(CollectionKeys.LOSSES): self._initialize_writers(only_initialize_if_missing=True) From af1e411c14631f3da0fe0de183f29b693cb52a26 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Fri, 26 Jun 2020 09:54:23 -0700 Subject: [PATCH 113/149] rename --- .../scripts/{tf2_save_metrics_eager.py => tf2_save_metrics.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/tensorflow2/scripts/{tf2_save_metrics_eager.py => tf2_save_metrics.py} (100%) diff --git a/examples/tensorflow2/scripts/tf2_save_metrics_eager.py b/examples/tensorflow2/scripts/tf2_save_metrics.py similarity index 100% rename from examples/tensorflow2/scripts/tf2_save_metrics_eager.py rename to examples/tensorflow2/scripts/tf2_save_metrics.py From 8cdd13ec76e9b0729f6cca973f9fb736d83a5b99 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Fri, 26 Jun 2020 11:00:28 -0700 Subject: [PATCH 114/149] change boolean logic --- smdebug/tensorflow/keras.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index e88fad65c..bf4be28d7 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -492,7 +492,7 @@ def _save_metrics(self, batch, logs, force_save=False): self._save_for_tensor(key, logs[key], check_before_write=False) def _save_layer_input_and_outputs(self, grad_tape=False): - if is_tf_version_2x() is False or (self.model.run_eagerly is False and grad_tape is False): + if is_tf_version_2x() is False or (grad_tape is False and self.model.run_eagerly is False): # This function only works when the run_eagerly is True return for layer_name in self.saved_layers: From 03e4f18638ee848779249931d828b3aa1c2a2b80 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Fri, 26 Jun 2020 11:46:24 -0700 Subject: [PATCH 115/149] bug fix --- smdebug/tensorflow/keras.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index bf4be28d7..82cf8411b 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -462,8 +462,8 @@ def save_smdebug_logs(self, logs): model_input_tensor_id += 1 tensors_to_save.append((export_name, logs[key])) collections_to_write = ( - {self.get_collection(CollectionKeys.OUTPUTS)} - if self._is_collection_being_saved_for_step(CollectionKeys.OUTPUTS) + {self.get_collection(CollectionKeys.INPUTS)} + if self._is_collection_being_saved_for_step(CollectionKeys.INPUTS) else set() ) for t_name, t_value in tensors_to_save: From 2660a76ad3aea546336db32921c0e5f80c02706a Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Mon, 29 Jun 2020 16:31:31 -0700 Subject: [PATCH 116/149] retrigger CI From fccf7e830dce6875d6d95e15cc97c64f7e37654b Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Thu, 2 Jul 2020 12:36:30 -0700 Subject: [PATCH 117/149] fix flag --- smdebug/tensorflow/keras.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 82cf8411b..f16340a0f 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -416,7 +416,7 @@ def _save_tensor(self, tensor_name, tensor_value, collections_to_write): if isinstance(collection, str): collection = self.get_collection(collection) collection.set_tensor_ref(tensor_ref) - self._save_for_tensor(tensor_name, t, check_before_write=False) + self._save_for_tensor(tensor_name, t, check_before_write=True) def save_smdebug_logs(self, logs): if logs is None: From f221f742f37514ea06aa7ccd1a5892220fe929d8 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Thu, 2 Jul 2020 17:45:57 -0700 Subject: [PATCH 118/149] duplicate set --- smdebug/core/collection.py | 1 + smdebug/tensorflow/keras.py | 18 +++++++++++------- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/smdebug/core/collection.py b/smdebug/core/collection.py index 77d9a0212..0c42d161c 100644 --- a/smdebug/core/collection.py +++ b/smdebug/core/collection.py @@ -26,6 +26,7 @@ class CollectionKeys: GRADIENTS = "gradients" LOSSES = "losses" BIASES = "biases" + LAYERS = "layers" # Use this collection to log scalars other than losses/metrics to SageMaker. # Mainly for Tensorflow. For all other frameworks, call save_scalar() API diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index f16340a0f..2d5d914c7 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -397,6 +397,10 @@ def _save_custom_tensors_post_step(self): def _save_tensor(self, tensor_name, tensor_value, collections_to_write): if isinstance(collections_to_write, set) is False: collections_to_write = {collections_to_write} + # Since this function modifies the set, there is a possibility + # of bugs if calling functions attempt to re-use the set passed + # to this function + collections_to_write = collections_to_write.copy() collections_to_save = self._get_collections_to_save_for_step() for collection in collections_to_save: if match_inc(tensor_name, collection.include_regex): @@ -500,18 +504,18 @@ def _save_layer_input_and_outputs(self, grad_tape=False): tensor = self.saved_layers[layer_name].layer_input export_name = get_export_name_for_keras(layer_name, tensor_type="input", tensor=tensor) input_collection = ( - {self.get_collection(CollectionKeys.INPUTS)} - if self._is_collection_being_saved_for_step(CollectionKeys.INPUTS) + {self.get_collection(CollectionKeys.LAYERS)} + if self._is_collection_being_saved_for_step(CollectionKeys.LAYERS) else set() ) self._save_tensor(export_name, tensor.numpy(), input_collection) # Save Output tensor = self.saved_layers[layer_name].layer_output export_name = get_export_name_for_keras(layer_name, tensor_type="output", tensor=tensor) - self._is_collection_being_saved_for_step(CollectionKeys.OUTPUTS) + self._is_collection_being_saved_for_step(CollectionKeys.LAYERS) output_collection = ( - {self.get_collection(CollectionKeys.OUTPUTS)} - if self._is_collection_being_saved_for_step(CollectionKeys.OUTPUTS) + {self.get_collection(CollectionKeys.LAYERS)} + if self._is_collection_being_saved_for_step(CollectionKeys.LAYERS) else set() ) self._save_tensor(export_name, tensor.numpy(), output_collection) @@ -720,11 +724,11 @@ def _save_layer_values(self, layer_outputs, collection, model=None, inputs=None) self._save_tensor(t_name, t_value, collections_to_write) def save_layer_outputs(self, layer_outputs, model=None): - self._save_layer_values(layer_outputs, self.get_collection(CollectionKeys.OUTPUTS), model) + self._save_layer_values(layer_outputs, self.get_collection(CollectionKeys.LAYERS), model) def save_layer_inputs(self, x, layer_outputs, model=None): self._save_layer_values( - layer_outputs, self.get_collection(CollectionKeys.INPUTS), model, inputs=x + layer_outputs, self.get_collection(CollectionKeys.LAYERS), model, inputs=x ) def _write_optimizer_variables(self): From 480db00dee674b41df2415d2c538a0b7c276d678 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Tue, 7 Jul 2020 01:01:39 -0700 Subject: [PATCH 119/149] pred --- smdebug/tensorflow/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/smdebug/tensorflow/utils.py b/smdebug/tensorflow/utils.py index 7cf063bbd..2c73e7b33 100644 --- a/smdebug/tensorflow/utils.py +++ b/smdebug/tensorflow/utils.py @@ -25,9 +25,9 @@ class ModelOutput: def get_model_output_export_name(key): export_names = { - ModelOutput.Y_PRED: "y_pred", - ModelOutput.Y: "y", - ModelOutput.VAL_Y: "val_y", + ModelOutput.Y_PRED: "y_predictions", + ModelOutput.Y: "y_labels", + ModelOutput.VAL_Y: "val_y_;an", ModelOutput.VAL_Y_PRED: "val_y_pred", } return export_names[key] From c0817b9d2170f1798cc86b6cbdfa7a39de9e11ba Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Wed, 8 Jul 2020 00:44:34 -0700 Subject: [PATCH 120/149] nit --- examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py | 1 + smdebug/core/collection.py | 1 + smdebug/tensorflow/utils.py | 4 ++-- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py b/examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py index 709561df7..3635cda4a 100644 --- a/examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py +++ b/examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py @@ -112,6 +112,7 @@ def test_keras_gradtape(out_dir): CollectionKeys.WEIGHTS, CollectionKeys.BIASES, CollectionKeys.GRADIENTS, + CollectionKeys.LAYERS, CollectionKeys.LOSSES, CollectionKeys.INPUTS, CollectionKeys.OUTPUTS, diff --git a/smdebug/core/collection.py b/smdebug/core/collection.py index 0c42d161c..6f33a92a0 100644 --- a/smdebug/core/collection.py +++ b/smdebug/core/collection.py @@ -76,6 +76,7 @@ class CollectionKeys: CollectionKeys.METRICS, CollectionKeys.INPUTS, CollectionKeys.OUTPUTS, + CollectionKeys.LAYERS, CollectionKeys.SM_METRICS, CollectionKeys.OPTIMIZER_VARIABLES, } diff --git a/smdebug/tensorflow/utils.py b/smdebug/tensorflow/utils.py index 2c73e7b33..9a79df83b 100644 --- a/smdebug/tensorflow/utils.py +++ b/smdebug/tensorflow/utils.py @@ -27,8 +27,8 @@ def get_model_output_export_name(key): export_names = { ModelOutput.Y_PRED: "y_predictions", ModelOutput.Y: "y_labels", - ModelOutput.VAL_Y: "val_y_;an", - ModelOutput.VAL_Y_PRED: "val_y_pred", + ModelOutput.VAL_Y: "val_y_labels", + ModelOutput.VAL_Y_PRED: "val_y_predictions", } return export_names[key] From 80a65c7283d4a174ce3f7a8541c78afb20255c20 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Fri, 10 Jul 2020 08:21:28 +0000 Subject: [PATCH 121/149] update --- .../tensorflow2/scripts/tf_save_metrics_gradient_tape.py | 8 +++++--- smdebug/tensorflow/keras.py | 3 +++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py b/examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py index 3635cda4a..817795f49 100644 --- a/examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py +++ b/examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py @@ -85,8 +85,9 @@ def helper_keras_gradtape( dataset_labels = labels labels = tf.one_hot(labels, depth=10) with hook.wrap_tape(tf.GradientTape(persistent=persistent)) as tape: - logits, layer_outputs = model(data, training=True, layer_outputs=True) + logits = model(data, training=True) loss_value = cce(labels, logits) + hook.save_custom_tensor("y_labels", labels, 'outputs') grads = tape.gradient(loss_value, model.variables) # By default, the resources held by a GradientTape are released as @@ -130,8 +131,9 @@ def test_keras_gradtape(out_dir): assert len(trial.tensor_names(collection=CollectionKeys.BIASES)) == 2 assert len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) == 2 assert len(trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES)) == 5 - assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 4 - assert len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 4 + assert len(trial.tensor_names(collection=CollectionKeys.LAYERS)) == 8 + assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 2 + assert len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 1 assert len(trial.tensor_names(collection=CollectionKeys.LOSSES)) == 1 assert len(trial.tensor_names(collection=CollectionKeys.METRICS)) == 1 diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 2d5d914c7..e71b6ae59 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -386,6 +386,8 @@ def _add_metric(self, metric_name, metric_value: tf.Tensor = None): self.tensor_to_collections[metric_name] = {coll} def save_custom_tensor(self, tensor_name, tensor_value, collections_to_write): + if isinstance(collections_to_write, str): + collections_to_write = [collections_to_write] for collection in collections_to_write: self.custom_tensors_to_save[tensor_name] = (tensor_value, collection) @@ -956,6 +958,7 @@ def run(*args, **kwargs): self._write_optimizer_variables() self._save_layer_input_and_outputs(grad_tape=True) + self._save_custom_tensors_post_step() if not ((isinstance(loss, tf.Tensor)) and hasattr(loss, "numpy")): return grads self._add_metric(metric_name="loss", metric_value=loss) From e7cb92aac56318be26a02935686b8c5ed0515a51 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Tue, 14 Jul 2020 23:24:55 -0700 Subject: [PATCH 122/149] rename default collection --- smdebug/tensorflow/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/smdebug/tensorflow/utils.py b/smdebug/tensorflow/utils.py index 9a79df83b..ff692fead 100644 --- a/smdebug/tensorflow/utils.py +++ b/smdebug/tensorflow/utils.py @@ -25,10 +25,10 @@ class ModelOutput: def get_model_output_export_name(key): export_names = { - ModelOutput.Y_PRED: "y_predictions", - ModelOutput.Y: "y_labels", - ModelOutput.VAL_Y: "val_y_labels", - ModelOutput.VAL_Y_PRED: "val_y_predictions", + ModelOutput.Y_PRED: "predictions", + ModelOutput.Y: "labels", + ModelOutput.VAL_Y: "labels", + ModelOutput.VAL_Y_PRED: "predictions", } return export_names[key] From 39b65df248f927073a168fb2d3e122a919cf4500 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Tue, 14 Jul 2020 23:51:15 -0700 Subject: [PATCH 123/149] model inputs --- smdebug/tensorflow/keras.py | 2 +- smdebug/tensorflow/utils.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index e71b6ae59..cd53a93e7 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -464,7 +464,7 @@ def save_smdebug_logs(self, logs): self.save_layer_inputs(logs[ModelInput.X], layer_outputs) else: tensors_to_save = [] - export_name = get_model_input_export_name(model_input_tensor_id) + export_name = get_model_input_export_name() model_input_tensor_id += 1 tensors_to_save.append((export_name, logs[key])) collections_to_write = ( diff --git a/smdebug/tensorflow/utils.py b/smdebug/tensorflow/utils.py index ff692fead..5f6f7caa8 100644 --- a/smdebug/tensorflow/utils.py +++ b/smdebug/tensorflow/utils.py @@ -40,8 +40,8 @@ class ModelInput: ModelInputs = {ModelInput.X} -def get_model_input_export_name(tensor_id): - return f"model_input:{tensor_id}" +def get_model_input_export_name(): + return f"model_input" class TFDistributionStrategy(Enum): From ca68f77c57e97e1278994b5714c8b019d3b0bfc7 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Wed, 15 Jul 2020 12:04:14 -0700 Subject: [PATCH 124/149] lint --- examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py b/examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py index 817795f49..db5284373 100644 --- a/examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py +++ b/examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py @@ -87,7 +87,7 @@ def helper_keras_gradtape( with hook.wrap_tape(tf.GradientTape(persistent=persistent)) as tape: logits = model(data, training=True) loss_value = cce(labels, logits) - hook.save_custom_tensor("y_labels", labels, 'outputs') + hook.save_custom_tensor("y_labels", labels, "outputs") grads = tape.gradient(loss_value, model.variables) # By default, the resources held by a GradientTape are released as From 281011d02f5f26fd087eed7b0047b14dc6cba65f Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Wed, 15 Jul 2020 12:35:51 -0700 Subject: [PATCH 125/149] update tests --- tests/tensorflow2/test_keras.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 54a695481..ae9a30c50 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -414,8 +414,8 @@ def test_keras_fit(out_dir, tf_eager_mode, saveall): if saveall: # save losses, metrics, weights, biases, scalar if tf_eager_mode: assert len(trial.tensor_names()) == (21 if is_tf_2_2() else 22) - assert len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 4 - assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 4 + assert len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 0 + assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 0 else: assert len(trial.tensor_names()) == 21 assert len(trial.tensor_names(collection=CollectionKeys.BIASES)) == 2 @@ -658,8 +658,8 @@ def test_keras_fit_pure_eager(out_dir, tf_eager_mode): assert len(trial.tensor_names(collection=CollectionKeys.BIASES)) == 2 assert len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) == 2 assert len(trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES)) == 5 - assert len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 4 - assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 4 + assert len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 0 + assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 0 @pytest.mark.skip # skip until aws tf update From 74de9c90202b098128d7c7b4bf615e432b08f5af Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Wed, 15 Jul 2020 13:17:23 -0700 Subject: [PATCH 126/149] modify assert --- tests/tensorflow/hooks/test_collection_defaults.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tensorflow/hooks/test_collection_defaults.py b/tests/tensorflow/hooks/test_collection_defaults.py index b5bf23b7b..8b00d6fe9 100644 --- a/tests/tensorflow/hooks/test_collection_defaults.py +++ b/tests/tensorflow/hooks/test_collection_defaults.py @@ -107,5 +107,5 @@ def test_get_custom_and_default_collections(): assert len(custom_collections) == 1 assert ( - len(default_collections) == 8 + 3 + len(default_collections) == 8 + 4 ) # Addtional three collections are: all, default and sm_metrics From 9abe494187ec610440b16dfeb7572f3aa9bb144a Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Thu, 16 Jul 2020 00:06:31 -0700 Subject: [PATCH 127/149] modify assert --- examples/tensorflow2/scripts/tf2_save_metrics.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/examples/tensorflow2/scripts/tf2_save_metrics.py b/examples/tensorflow2/scripts/tf2_save_metrics.py index bc4a56952..7f473dcbf 100644 --- a/examples/tensorflow2/scripts/tf2_save_metrics.py +++ b/examples/tensorflow2/scripts/tf2_save_metrics.py @@ -132,12 +132,8 @@ def test_keras_fit_eager(out_dir, tf_eager_mode=True): assert len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) == 2 assert len(trial.tensor_names(collection=CollectionKeys.BIASES)) == 2 assert len(trial.tensor_names(collection=CollectionKeys.GRADIENTS)) == 4 - assert ( - len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 5 - ) # 1 Model Input + 4 Layer Inputs - assert ( - len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 6 - ) # 2 Model outputs + 4 Layer OUTPUTS + assert len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 1 # 1 Model Input + assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 2 # 2 Model outputs assert len(trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES)) == 5 # We assert that all the tensors saved have a valid value @@ -187,12 +183,8 @@ def test_keras_fit_false(out_dir, tf_eager_mode=False): assert len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) == 2 assert len(trial.tensor_names(collection=CollectionKeys.BIASES)) == 2 assert len(trial.tensor_names(collection=CollectionKeys.GRADIENTS)) == 4 - assert ( - len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 5 - ) # 1 Model Input + 4 Layer Inputs - assert ( - len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 6 - ) # 2 Model outputs + 4 Layer OUTPUTS + assert len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 1 # 1 Model Input + assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 2 # 2 Model outputs assert len(trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES)) == 5 # We assert that all the tensors saved have a valid value From 33c21c0aa7ec27007da1647e498fddfc2829f368 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Thu, 16 Jul 2020 00:13:02 -0700 Subject: [PATCH 128/149] save Layers --- examples/tensorflow2/scripts/tf2_save_metrics.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/tensorflow2/scripts/tf2_save_metrics.py b/examples/tensorflow2/scripts/tf2_save_metrics.py index 7f473dcbf..6ce11dd79 100644 --- a/examples/tensorflow2/scripts/tf2_save_metrics.py +++ b/examples/tensorflow2/scripts/tf2_save_metrics.py @@ -114,6 +114,7 @@ def test_keras_fit_eager(out_dir, tf_eager_mode=True): CollectionKeys.GRADIENTS, CollectionKeys.INPUTS, CollectionKeys.OUTPUTS, + CollectionKeys.LAYERS, CollectionKeys.OPTIMIZER_VARIABLES, ] hook = smd.KerasHook(out_dir=out_dir, include_collections=test_include_collections) @@ -165,6 +166,7 @@ def test_keras_fit_false(out_dir, tf_eager_mode=False): CollectionKeys.GRADIENTS, CollectionKeys.INPUTS, CollectionKeys.OUTPUTS, + CollectionKeys.LAYERS, CollectionKeys.OPTIMIZER_VARIABLES, ] hook = smd.KerasHook(out_dir=out_dir, include_collections=test_include_collections) From 7bd87c8f7584cbda73be5427256e062f2f5e5140 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Fri, 17 Jul 2020 00:58:34 -0700 Subject: [PATCH 129/149] clear saved collections after saving --- smdebug/tensorflow/keras.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index cd53a93e7..0ea27e617 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -106,11 +106,13 @@ def _is_not_supported(self): return not self._hook_supported def should_save_global_step_for_mode(self, mode: str): + # This function is called by the hook in the AWS TF codebase mode = str_to_mode_keys(mode) mode_step = self.mode_steps[mode] return self.save_config.should_save_step(mode, mode_step) def register_model(self, model): + # This function is called by the hook in the AWS TF codebase self.model = model self._wrap_model_with_input_output_saver() self.has_registered_model = True @@ -395,18 +397,20 @@ def _save_custom_tensors_post_step(self): for tensor_name in self.custom_tensors_to_save: tensor_value, collection_names = self.custom_tensors_to_save[tensor_name] self._save_tensor(tensor_name, tensor_value, collection_names) + # Clear saved custom tensors + self.custom_tensors_to_save.clear() - def _save_tensor(self, tensor_name, tensor_value, collections_to_write): - if isinstance(collections_to_write, set) is False: - collections_to_write = {collections_to_write} + def _save_tensor(self, tensor_name, tensor_value, collections): + if isinstance(collections, set) is False: + collections = {collections} # Since this function modifies the set, there is a possibility # of bugs if calling functions attempt to re-use the set passed # to this function - collections_to_write = collections_to_write.copy() + collections_to_write = collections.copy() collections_to_save = self._get_collections_to_save_for_step() - for collection in collections_to_save: - if match_inc(tensor_name, collection.include_regex): - collections_to_write.add(collection) + for c in collections_to_save: + if match_inc(tensor_name, c.include_regex): + collections_to_write.add(c) self._initialize_writers(only_initialize_if_missing=True) tensor_refs = [] if isinstance(tensor_value, values.PerReplica): @@ -428,8 +432,6 @@ def save_smdebug_logs(self, logs): if logs is None: return - model_input_tensor_id = 0 - for key in logs: if "smdebug_" in key: if key in ModelOutputs: @@ -465,7 +467,6 @@ def save_smdebug_logs(self, logs): else: tensors_to_save = [] export_name = get_model_input_export_name() - model_input_tensor_id += 1 tensors_to_save.append((export_name, logs[key])) collections_to_write = ( {self.get_collection(CollectionKeys.INPUTS)} From 651d6ea972353f4c53c691189eca2983fc4c45d7 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Fri, 17 Jul 2020 08:50:42 -0700 Subject: [PATCH 130/149] refactor --- smdebug/tensorflow/keras.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 0ea27e617..9ca11b79c 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -433,9 +433,11 @@ def save_smdebug_logs(self, logs): return for key in logs: + tensors_to_save = [] + collections_to_write = set() if "smdebug_" in key: + # Save Model Outputs if key in ModelOutputs: - tensors_to_save = [] export_name = get_model_output_export_name(key) tensors_to_save.append((export_name, logs[key])) collections_to_write = ( @@ -443,10 +445,8 @@ def save_smdebug_logs(self, logs): if self._is_collection_being_saved_for_step(CollectionKeys.OUTPUTS) else set() ) - for t_name, t_value in tensors_to_save: - self._save_tensor(t_name, t_value, collections_to_write) + # Save Gradients elif key == SMDEBUG_GRADIENTS_KEY: - tensors_to_save = [] gradients = logs[key] if gradients is not None: for g, v in zip(gradients, self.model.trainable_variables): @@ -458,14 +458,13 @@ def save_smdebug_logs(self, logs): g = g.values tensors_to_save.append((export_name, g)) collections_to_write = {self.get_collection(CollectionKeys.GRADIENTS)} - for t_name, t_value in tensors_to_save: - self._save_tensor(t_name, t_value, collections_to_write) + # Save Intermediate Layers elif key == SMDEBUG_LAYER_OUTPUTS_KEY: layer_outputs = logs[key] self.save_layer_outputs(layer_outputs) self.save_layer_inputs(logs[ModelInput.X], layer_outputs) - else: - tensors_to_save = [] + # Save Model Inputs + elif key in ModelInput: export_name = get_model_input_export_name() tensors_to_save.append((export_name, logs[key])) collections_to_write = ( @@ -473,8 +472,8 @@ def save_smdebug_logs(self, logs): if self._is_collection_being_saved_for_step(CollectionKeys.INPUTS) else set() ) - for t_name, t_value in tensors_to_save: - self._save_tensor(t_name, t_value, collections_to_write) + for t_name, t_value in tensors_to_save: + self._save_tensor(t_name, t_value, collections_to_write) def _save_metrics(self, batch, logs, force_save=False): # if force_save is True, doesn't check whether collection needs to be saved for steps From 1aaabe775204214d3a99136350084e4ba0445b89 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Fri, 17 Jul 2020 14:18:17 -0700 Subject: [PATCH 131/149] nit --- smdebug/tensorflow/keras.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 9ca11b79c..17eff15d2 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -19,6 +19,7 @@ from .tensor_ref import TensorRef, get_tf_names from .utils import ( ModelInput, + ModelInputs, ModelOutputs, TFDistributionStrategy, get_export_name_for_keras, @@ -464,7 +465,7 @@ def save_smdebug_logs(self, logs): self.save_layer_outputs(layer_outputs) self.save_layer_inputs(logs[ModelInput.X], layer_outputs) # Save Model Inputs - elif key in ModelInput: + elif key in ModelInputs: export_name = get_model_input_export_name() tensors_to_save.append((export_name, logs[key])) collections_to_write = ( From 6d3b733a3fb4701a495388e7c0933de7623fc091 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Wed, 22 Jul 2020 04:15:19 -0700 Subject: [PATCH 132/149] pr comments --- .../scripts/tf_save_metrics_gradient_tape.py | 2 +- smdebug/core/modes.py | 10 +++++----- smdebug/tensorflow/keras.py | 17 ++++++++++++----- tests/tensorflow2/test_keras.py | 8 ++++---- 4 files changed, 22 insertions(+), 15 deletions(-) diff --git a/examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py b/examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py index db5284373..08bcf0b53 100644 --- a/examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py +++ b/examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py @@ -87,7 +87,7 @@ def helper_keras_gradtape( with hook.wrap_tape(tf.GradientTape(persistent=persistent)) as tape: logits = model(data, training=True) loss_value = cce(labels, logits) - hook.save_custom_tensor("y_labels", labels, "outputs") + hook.save_tensor("y_labels", labels, "outputs") grads = tape.gradient(loss_value, model.variables) # By default, the resources held by a GradientTape are released as diff --git a/smdebug/core/modes.py b/smdebug/core/modes.py index 809df1ce9..16c2f39de 100644 --- a/smdebug/core/modes.py +++ b/smdebug/core/modes.py @@ -16,14 +16,14 @@ class ModeKeys(Enum): MODE_PLUGIN_NAME = "mode" -def str_to_mode_keys(s): - if s == "train": +def str_to_mode_keys(mode_str): + if mode_str == "train": return ModeKeys.TRAIN - elif s == "eval": + elif mode_str == "eval": return ModeKeys.EVAL - elif s == "predict": + elif mode_str == "predict": return ModeKeys.PREDICT - elif s == "global": + elif mode_str == "global": return ModeKeys.GLOBAL else: raise Exception("Invalid mode") diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 17eff15d2..d2b5846be 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -65,7 +65,9 @@ def __init__( self.tensor_refs_to_save_this_step = set() self._fetches_added = set() self.callable_cache = CallableCache() - self.custom_tensors_to_save = dict() + self.custom_tensors_to_save = ( + dict() + ) # stores tensors custom tensors saved by users every step self.saved_layers = dict() self.has_registered_model = False @@ -108,12 +110,16 @@ def _is_not_supported(self): def should_save_global_step_for_mode(self, mode: str): # This function is called by the hook in the AWS TF codebase + # It returns a boolean value indicating to AWS TF if a step is + # Being saved for the step mode = str_to_mode_keys(mode) mode_step = self.mode_steps[mode] return self.save_config.should_save_step(mode, mode_step) def register_model(self, model): # This function is called by the hook in the AWS TF codebase + # It attaches a hook to every layer of the model to capture + # layer values self.model = model self._wrap_model_with_input_output_saver() self.has_registered_model = True @@ -388,13 +394,13 @@ def _add_metric(self, metric_name, metric_value: tf.Tensor = None): coll.set_tensor_ref(TensorRef.from_non_graph_var(metric_name)) self.tensor_to_collections[metric_name] = {coll} - def save_custom_tensor(self, tensor_name, tensor_value, collections_to_write): + def save_tensor(self, tensor_name, tensor_value, collections_to_write): if isinstance(collections_to_write, str): collections_to_write = [collections_to_write] for collection in collections_to_write: self.custom_tensors_to_save[tensor_name] = (tensor_value, collection) - def _save_custom_tensors_post_step(self): + def _save_tensors_post_step(self): for tensor_name in self.custom_tensors_to_save: tensor_value, collection_names = self.custom_tensors_to_save[tensor_name] self._save_tensor(tensor_name, tensor_value, collection_names) @@ -499,6 +505,7 @@ def _save_metrics(self, batch, logs, force_save=False): self._save_for_tensor(key, logs[key], check_before_write=False) def _save_layer_input_and_outputs(self, grad_tape=False): + # Iterates over all the saved layers for input and output values if is_tf_version_2x() is False or (grad_tape is False and self.model.run_eagerly is False): # This function only works when the run_eagerly is True return @@ -529,7 +536,7 @@ def _save_tensors_post_step(self, batch, logs): self._save_metrics(batch, logs) self.save_smdebug_logs(logs) self._save_layer_input_and_outputs() - self._save_custom_tensors_post_step() + self._save_tensors_post_step() if is_tf_version_2x() and tf.executing_eagerly(): for tensor_ref in self.tensor_refs_to_save_this_step: @@ -959,7 +966,7 @@ def run(*args, **kwargs): self._write_optimizer_variables() self._save_layer_input_and_outputs(grad_tape=True) - self._save_custom_tensors_post_step() + self._save_tensors_post_step() if not ((isinstance(loss, tf.Tensor)) and hasattr(loss, "numpy")): return grads self._add_metric(metric_name="loss", metric_value=loss) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index ae9a30c50..4d88d0466 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -708,15 +708,15 @@ def test_save_gradients(out_dir, tf_eager_mode): assert output.value(0) is not None -def test_save_custom_tensors(out_dir, tf_eager_mode): +def test_save_tensors(out_dir, tf_eager_mode): include_collections = ["custom_coll"] hook = smd.KerasHook(out_dir=out_dir, include_collections=include_collections) t1 = tf.constant([0, 1, 1, 2, 3, 5, 8, 13, 21, 34]) t2 = tf.Variable([5 + 4j, 6 + 1j]) t3 = tf.Variable([False, False, False, True]) - hook.save_custom_tensor("custom_tensor_1", t1, include_collections) - hook.save_custom_tensor("custom_tensor_2", t2, include_collections) - hook.save_custom_tensor("custom_tensor_3", t3, include_collections) + hook.save_tensor("custom_tensor_1", t1, include_collections) + hook.save_tensor("custom_tensor_2", t2, include_collections) + hook.save_tensor("custom_tensor_3", t3, include_collections) helper_keras_fit( trial_dir=out_dir, From 0f0877341974fcc29e5affb856db3fabaef22af9 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Wed, 22 Jul 2020 04:31:40 -0700 Subject: [PATCH 133/149] save tensor api --- examples/tensorflow2/scripts/tf_keras_gradienttape.py | 4 +++- .../tensorflow2/scripts/tf_save_metrics_gradient_tape.py | 4 +++- smdebug/tensorflow/keras.py | 7 +++++++ tests/tensorflow2/test_keras.py | 4 +++- .../test_tensorflow2_gradtape_integration.py | 4 +++- 5 files changed, 19 insertions(+), 4 deletions(-) diff --git a/examples/tensorflow2/scripts/tf_keras_gradienttape.py b/examples/tensorflow2/scripts/tf_keras_gradienttape.py index 98c87cd83..404639a3b 100644 --- a/examples/tensorflow2/scripts/tf_keras_gradienttape.py +++ b/examples/tensorflow2/scripts/tf_keras_gradienttape.py @@ -65,7 +65,9 @@ def train(batch_size, n_epochs, model, hook): optimizer.apply_gradients(zip(grads, model.trainable_variables)) acc = train_acc_metric(dataset_labels, logits) # save metrics value - hook.record_tensor_value(tensor_name="accuracy", tensor_value=acc) + hook.save_tensor( + tensor_name="accuracy", tensor_value=acc, collections_to_write="metrics" + ) values = [("Accuracy", train_acc_metric.result())] progBar.update(idx * batch_size, values=values) diff --git a/examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py b/examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py index 08bcf0b53..d9d9ec284 100644 --- a/examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py +++ b/examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py @@ -99,7 +99,9 @@ def helper_keras_gradtape( _ = tape.gradient(loss_value, model.variables) opt.apply_gradients(zip(grads, model.variables)) acc = train_acc_metric(dataset_labels, logits) - hook.record_tensor_value(tensor_name="accuracy", tensor_value=acc) + hook.save_tensor( + tensor_name="accuracy", tensor_value=acc, collections_to_write="metrics" + ) train_acc_metric.reset_states() hook.close() diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index d2b5846be..56e08c819 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -395,6 +395,11 @@ def _add_metric(self, metric_name, metric_value: tf.Tensor = None): self.tensor_to_collections[metric_name] = {coll} def save_tensor(self, tensor_name, tensor_value, collections_to_write): + if ( + not ((isinstance(tensor_value, tf.Tensor)) and hasattr(tensor_value, "numpy")) + ) or self._is_not_supported(): + return + if isinstance(collections_to_write, str): collections_to_write = [collections_to_write] for collection in collections_to_write: @@ -1026,6 +1031,8 @@ def record_tensor_value(self, tensor_name, tensor_value): ) or self._is_not_supported(): return + self.logger.warning("This function has been deprecated. Please use the save_tensor API ") + self._add_metric(metric_name=tensor_name, metric_value=tensor_value) if self._is_collection_being_saved_for_step(CollectionKeys.METRICS): self._initialize_writers(only_initialize_if_missing=True) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 4d88d0466..20038f139 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -173,7 +173,9 @@ def helper_keras_gradtape( _ = tape.gradient(loss_value, model.variables) opt.apply_gradients(zip(grads, model.variables)) acc = train_acc_metric(dataset_labels, logits) - hook.record_tensor_value(tensor_name="accuracy", tensor_value=acc) + hook.save_tensor( + tensor_name="accuracy", tensor_value=acc, collections_to_write="metrics" + ) train_acc_metric.reset_states() hook.close() diff --git a/tests/zero_code_change/test_tensorflow2_gradtape_integration.py b/tests/zero_code_change/test_tensorflow2_gradtape_integration.py index 4df4fc49c..b4a5d85c8 100644 --- a/tests/zero_code_change/test_tensorflow2_gradtape_integration.py +++ b/tests/zero_code_change/test_tensorflow2_gradtape_integration.py @@ -67,7 +67,9 @@ def helper_test_keras_v2_gradienttape(script_mode: bool = False, json_file_conte grads = tape.gradient(loss_value, model.variables) opt.apply_gradients(zip(grads, model.variables)) acc = train_acc_metric(dataset_labels, logits) - hook.record_tensor_value(tensor_name="accuracy", tensor_value=acc) + hook.save_tensor( + tensor_name="accuracy", tensor_value=acc, collections_to_write="metrics" + ) log = "Epoch %d " % (epoch + 1) log += "Accuracy %.4f" % train_acc_metric.result() print(log) From 3015cae906cd331627867e0b63a541887cba404d Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Wed, 22 Jul 2020 06:07:23 -0700 Subject: [PATCH 134/149] revert typo --- smdebug/tensorflow/keras.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 56e08c819..1b46976ce 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -405,7 +405,7 @@ def save_tensor(self, tensor_name, tensor_value, collections_to_write): for collection in collections_to_write: self.custom_tensors_to_save[tensor_name] = (tensor_value, collection) - def _save_tensors_post_step(self): + def _save_custom_tensors_post_step(self): for tensor_name in self.custom_tensors_to_save: tensor_value, collection_names = self.custom_tensors_to_save[tensor_name] self._save_tensor(tensor_name, tensor_value, collection_names) @@ -541,7 +541,7 @@ def _save_tensors_post_step(self, batch, logs): self._save_metrics(batch, logs) self.save_smdebug_logs(logs) self._save_layer_input_and_outputs() - self._save_tensors_post_step() + self._save_custom_tensors_post_step() if is_tf_version_2x() and tf.executing_eagerly(): for tensor_ref in self.tensor_refs_to_save_this_step: @@ -971,7 +971,7 @@ def run(*args, **kwargs): self._write_optimizer_variables() self._save_layer_input_and_outputs(grad_tape=True) - self._save_tensors_post_step() + self._save_custom_tensors_post_step() if not ((isinstance(loss, tf.Tensor)) and hasattr(loss, "numpy")): return grads self._add_metric(metric_name="loss", metric_value=loss) From cca7fea835d9532cfd875fc9b5368d7b8d78b085 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Wed, 22 Jul 2020 07:51:44 -0700 Subject: [PATCH 135/149] save custom tensors --- .../tensorflow2/scripts/tf_save_metrics_gradient_tape.py | 4 +++- smdebug/tensorflow/keras.py | 7 +++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py b/examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py index d9d9ec284..694415985 100644 --- a/examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py +++ b/examples/tensorflow2/scripts/tf_save_metrics_gradient_tape.py @@ -100,7 +100,9 @@ def helper_keras_gradtape( opt.apply_gradients(zip(grads, model.variables)) acc = train_acc_metric(dataset_labels, logits) hook.save_tensor( - tensor_name="accuracy", tensor_value=acc, collections_to_write="metrics" + tensor_name="accuracy", + tensor_value=acc, + collections_to_write=CollectionKeys.METRICS, ) train_acc_metric.reset_states() diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 1b46976ce..007e3d62e 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -394,14 +394,17 @@ def _add_metric(self, metric_name, metric_value: tf.Tensor = None): coll.set_tensor_ref(TensorRef.from_non_graph_var(metric_name)) self.tensor_to_collections[metric_name] = {coll} - def save_tensor(self, tensor_name, tensor_value, collections_to_write): + def save_tensor(self, tensor_name, tensor_value, collections_to_write=None): if ( not ((isinstance(tensor_value, tf.Tensor)) and hasattr(tensor_value, "numpy")) ) or self._is_not_supported(): return + if collections_to_write is None: + collections_to_write = "default" if isinstance(collections_to_write, str): collections_to_write = [collections_to_write] + for collection in collections_to_write: self.custom_tensors_to_save[tensor_name] = (tensor_value, collection) @@ -896,6 +899,7 @@ def run(*args, **kwargs): self.worker = self._get_worker_name() if self.writer is not None or len(self.writer_map): + self._save_custom_tensors_post_step() self._close_writers() if not self.prepared_collections: @@ -971,7 +975,6 @@ def run(*args, **kwargs): self._write_optimizer_variables() self._save_layer_input_and_outputs(grad_tape=True) - self._save_custom_tensors_post_step() if not ((isinstance(loss, tf.Tensor)) and hasattr(loss, "numpy")): return grads self._add_metric(metric_name="loss", metric_value=loss) From bbf1bf691d73e3f7b7f9f7e3bd10f2247d80756f Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Wed, 22 Jul 2020 12:40:09 -0700 Subject: [PATCH 136/149] pr comments --- smdebug/tensorflow/keras.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 007e3d62e..b6aa4395b 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -409,6 +409,8 @@ def save_tensor(self, tensor_name, tensor_value, collections_to_write=None): self.custom_tensors_to_save[tensor_name] = (tensor_value, collection) def _save_custom_tensors_post_step(self): + # This saves all the values of custom tensors + # that the user has saved with the save_tensor api for tensor_name in self.custom_tensors_to_save: tensor_value, collection_names = self.custom_tensors_to_save[tensor_name] self._save_tensor(tensor_name, tensor_value, collection_names) @@ -465,8 +467,10 @@ def save_smdebug_logs(self, logs): gradients = logs[key] if gradients is not None: for g, v in zip(gradients, self.model.trainable_variables): - layer = v.name.split(":")[0] - export_name = "gradients/" + layer + "Grad" + layer_name = v.name + if layer_name.split(":") > 1: + layer_name = layer_name.split(":")[0] + export_name = "gradients/" + layer_name + "Grad" if isinstance(g, IndexedSlices): # This class is a simple wrapper for a pair of Tensor objects # See: https://www.tensorflow.org/api_docs/python/tf/IndexedSlices From a32a8d471c2f3e9c1c66734e7bd6d350f14669e7 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Wed, 22 Jul 2020 14:16:20 -0700 Subject: [PATCH 137/149] len --- smdebug/tensorflow/keras.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index b6aa4395b..23b160bd8 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -468,7 +468,7 @@ def save_smdebug_logs(self, logs): if gradients is not None: for g, v in zip(gradients, self.model.trainable_variables): layer_name = v.name - if layer_name.split(":") > 1: + if len(layer_name.split(":")) > 1: layer_name = layer_name.split(":")[0] export_name = "gradients/" + layer_name + "Grad" if isinstance(g, IndexedSlices): From 259414aa8a2355d2babc4a0f81553a406eb54fa6 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Wed, 22 Jul 2020 20:35:42 -0700 Subject: [PATCH 138/149] default --- smdebug/tensorflow/keras.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 23b160bd8..8f57dddbd 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -394,13 +394,11 @@ def _add_metric(self, metric_name, metric_value: tf.Tensor = None): coll.set_tensor_ref(TensorRef.from_non_graph_var(metric_name)) self.tensor_to_collections[metric_name] = {coll} - def save_tensor(self, tensor_name, tensor_value, collections_to_write=None): + def save_tensor(self, tensor_name, tensor_value, collections_to_write="default"): if ( not ((isinstance(tensor_value, tf.Tensor)) and hasattr(tensor_value, "numpy")) ) or self._is_not_supported(): return - if collections_to_write is None: - collections_to_write = "default" if isinstance(collections_to_write, str): collections_to_write = [collections_to_write] From b1ad7a0e9fde8808e1effeed41a13d9a0e8259ae Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Thu, 23 Jul 2020 14:48:39 -0700 Subject: [PATCH 139/149] save smdebug logs --- smdebug/tensorflow/keras.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 8f57dddbd..ca2449c46 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -1004,6 +1004,10 @@ def run(*args, **kwargs): return run + def save_tape_logs(self, model_inputs=None, outputs=None): + logs = {"smdebug_y": outputs, "smdebug_x": model_inputs} + self.save_smdebug_logs(logs) + def wrap_tape(self, tape): """ Wrapping your GradientTape with this method enables finding gradient tensors and optimizer From d3b54c31408197c541d94b0ae9a9bba296b01a87 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Thu, 23 Jul 2020 15:12:12 -0700 Subject: [PATCH 140/149] comments --- smdebug/tensorflow/keras.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index ca2449c46..14f6d21fb 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -1005,6 +1005,12 @@ def run(*args, **kwargs): return run def save_tape_logs(self, model_inputs=None, outputs=None): + """ + called by AWS TF to save model inputs and outputs + :param model_inputs: + :param outputs: + :return: + """ logs = {"smdebug_y": outputs, "smdebug_x": model_inputs} self.save_smdebug_logs(logs) From fb548a969dd7f02caadc0419c006d836b46f4a40 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Thu, 23 Jul 2020 15:22:08 -0700 Subject: [PATCH 141/149] update --- smdebug/core/tfevent/util.py | 2 ++ smdebug/core/utils.py | 8 ++++++++ smdebug/tensorflow/keras.py | 17 ++++------------- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/smdebug/core/tfevent/util.py b/smdebug/core/tfevent/util.py index 3b9136acb..0708069ab 100644 --- a/smdebug/core/tfevent/util.py +++ b/smdebug/core/tfevent/util.py @@ -61,6 +61,8 @@ def make_numpy_array(x): return np.array([x]) elif isinstance(x, tuple): return np.asarray(x, dtype=x.dtype) + elif isinstance(x, list): + return np.asarray(x) else: raise TypeError( "_make_numpy_array only accepts input types of numpy.ndarray, scalar," diff --git a/smdebug/core/utils.py b/smdebug/core/utils.py index 02ca881c2..af871c527 100644 --- a/smdebug/core/utils.py +++ b/smdebug/core/utils.py @@ -297,6 +297,14 @@ def remove_file_if_exists(file_path): os.remove(file_path) +def validate_custom_tensor_value(tensor_value, make_numpy_fn): + try: + make_numpy_fn(tensor_value) + except TypeError: + return False + return True + + class SagemakerSimulator(object): """ Creates an environment variable pointing to a JSON config file, and creates the config file. diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 14f6d21fb..0110fefd6 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -7,8 +7,8 @@ from tensorflow.python.framework.indexed_slices import IndexedSlices # First Party -from smdebug.core.modes import ModeKeys, str_to_mode_keys -from smdebug.core.utils import match_inc +from smdebug.core.modes import ModeKeys +from smdebug.core.utils import match_inc, validate_custom_tensor_value from smdebug.tensorflow.callable_cache import CallableCache from smdebug.tensorflow.utils import InputOutputSaver, get_layer_call_fn @@ -108,14 +108,6 @@ def _is_not_supported(self): self._hook_supported = False return not self._hook_supported - def should_save_global_step_for_mode(self, mode: str): - # This function is called by the hook in the AWS TF codebase - # It returns a boolean value indicating to AWS TF if a step is - # Being saved for the step - mode = str_to_mode_keys(mode) - mode_step = self.mode_steps[mode] - return self.save_config.should_save_step(mode, mode_step) - def register_model(self, model): # This function is called by the hook in the AWS TF codebase # It attaches a hook to every layer of the model to capture @@ -395,9 +387,8 @@ def _add_metric(self, metric_name, metric_value: tf.Tensor = None): self.tensor_to_collections[metric_name] = {coll} def save_tensor(self, tensor_name, tensor_value, collections_to_write="default"): - if ( - not ((isinstance(tensor_value, tf.Tensor)) and hasattr(tensor_value, "numpy")) - ) or self._is_not_supported(): + if validate_custom_tensor_value(tensor_value, self._make_numpy_array) is False: + self.logger.warn("The tensor value could not be converted into a numpy value") return if isinstance(collections_to_write, str): From 7ca294234bbc695702797867316bdd2542170b69 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Mon, 27 Jul 2020 15:58:00 -0700 Subject: [PATCH 142/149] constants --- smdebug/tensorflow/keras.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 0110fefd6..4d05fe642 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -20,6 +20,7 @@ from .utils import ( ModelInput, ModelInputs, + ModelOutput, ModelOutputs, TFDistributionStrategy, get_export_name_for_keras, @@ -1002,7 +1003,7 @@ def save_tape_logs(self, model_inputs=None, outputs=None): :param outputs: :return: """ - logs = {"smdebug_y": outputs, "smdebug_x": model_inputs} + logs = {ModelOutput.Y: outputs, ModelInput.X: model_inputs} self.save_smdebug_logs(logs) def wrap_tape(self, tape): From 2df55e0e47f4895d76e3a6e524a60df52a9f7d28 Mon Sep 17 00:00:00 2001 From: Nihal Harish Date: Mon, 27 Jul 2020 17:05:36 -0700 Subject: [PATCH 143/149] Implement Save Tensor For Mxnet and Pytorch (#291) --- smdebug/core/hook.py | 23 ++++++++++++++++ smdebug/mxnet/hook.py | 1 + smdebug/mxnet/utils.py | 2 ++ smdebug/pytorch/hook.py | 2 ++ smdebug/pytorch/utils.py | 2 ++ smdebug/tensorflow/constants.py | 1 + smdebug/tensorflow/keras.py | 16 +++++------ tests/mxnet/mnist_gluon_model.py | 9 ++++++ tests/mxnet/test_custom_tensor.py | 32 ++++++++++++++++++++++ tests/mxnet/test_hook_all_zero.py | 3 +- tests/pytorch/test_save_custom_tensor.py | 35 ++++++++++++++++++++++++ tests/pytorch/utils.py | 13 +++++++-- 12 files changed, 126 insertions(+), 13 deletions(-) create mode 100644 tests/mxnet/test_custom_tensor.py create mode 100644 tests/pytorch/test_save_custom_tensor.py diff --git a/smdebug/core/hook.py b/smdebug/core/hook.py index 7e70404da..1885ca09d 100644 --- a/smdebug/core/hook.py +++ b/smdebug/core/hook.py @@ -44,6 +44,7 @@ match_inc, remove_claim_file, size_and_shape, + validate_custom_tensor_value, ) from smdebug.core.writer import FileWriter from smdebug.exceptions import InvalidCollectionConfiguration @@ -880,6 +881,7 @@ def __init__( ) self.exported_collections = False self.data_type_name = data_type_name + self.custom_tensors_to_save = dict() def _cleanup(self): if not self.exported_collections: @@ -905,6 +907,23 @@ def _write(self, module_name, var, suffix, idx): ) return idx + def save_tensor(self, tensor_name, tensor_value, collections_to_write=CollectionKeys.DEFAULT): + if validate_custom_tensor_value(tensor_value, self._make_numpy_array) is False: + self.logger.warn("The tensor value could not be converted into a numpy value") + return + if isinstance(collections_to_write, str): + collections_to_write = [collections_to_write] + for collection in collections_to_write: + self.custom_tensors_to_save[tensor_name] = (tensor_value, collection) + + def _save_custom_tensors_post_step(self): + for tensor_name in self.custom_tensors_to_save: + tensor_value, collection_names = self.custom_tensors_to_save[tensor_name] + c = self.collection_manager.get(collection_names, create=True) + c.add_tensor_name(tensor_name) + self._write_raw_tensor(tensor_name, tensor_value, [c]) + self.custom_tensors_to_save.clear() + def _write_inputs(self, name, inputs): tensor_name = name + CallbackHook.INPUT_TENSOR_SUFFIX idx = self.written_tensor_name_for_step.get(tensor_name, 0) @@ -922,3 +941,7 @@ def _write_outputs(self, name, outputs): @abstractmethod def _export_model(self): pass + + @staticmethod + def _make_numpy_array(tensor_value): + pass diff --git a/smdebug/mxnet/hook.py b/smdebug/mxnet/hook.py index cacdfb98c..7234fbf88 100644 --- a/smdebug/mxnet/hook.py +++ b/smdebug/mxnet/hook.py @@ -154,6 +154,7 @@ def forward_pre_hook(self, block, inputs): self.exported_collections = True self.last_block = block + self._save_custom_tensors_post_step() # This hook is invoked by trainer after running the forward pass. def forward_hook(self, block, inputs, outputs): diff --git a/smdebug/mxnet/utils.py b/smdebug/mxnet/utils.py index ab27fe1ad..aa228145e 100644 --- a/smdebug/mxnet/utils.py +++ b/smdebug/mxnet/utils.py @@ -54,6 +54,8 @@ def make_numpy_array(x): elif isinstance(x, tuple): # todo: fix this, will crash return np.asarray(x, dtype=x.dtype) + elif isinstance(x, list): + return np.asarray(x) else: raise TypeError( "_make_numpy_array only accepts input types of numpy.ndarray, scalar," diff --git a/smdebug/pytorch/hook.py b/smdebug/pytorch/hook.py index fe8834fda..c50debf8a 100644 --- a/smdebug/pytorch/hook.py +++ b/smdebug/pytorch/hook.py @@ -162,6 +162,7 @@ def forward_hook(self, module, inputs, outputs): # Output output tensors self._write_outputs(module_name, outputs) + self._save_custom_tensors_post_step() self.last_saved_step = self.step def backward_hook(self, tname): @@ -172,6 +173,7 @@ def back(grad): if grad is not None: # self.logger.debug(f"Processing the backward step " f"{self.step} for {tname}") self._save_for_tensor(self.GRADIENT_PREFIX + tname, grad) + self._save_custom_tensors_post_step() return back diff --git a/smdebug/pytorch/utils.py b/smdebug/pytorch/utils.py index 95359257c..ea0caf949 100644 --- a/smdebug/pytorch/utils.py +++ b/smdebug/pytorch/utils.py @@ -45,6 +45,8 @@ def make_numpy_array(x): return x.to(torch.device("cpu")).data.numpy() elif isinstance(x, tuple): return np.asarray(x, dtype=x.dtype) + elif isinstance(x, list): + return np.asarray(x) else: raise TypeError( "_make_numpy_array only accepts input types of numpy.ndarray, scalar," diff --git a/smdebug/tensorflow/constants.py b/smdebug/tensorflow/constants.py index c2de1677f..4e52a0114 100644 --- a/smdebug/tensorflow/constants.py +++ b/smdebug/tensorflow/constants.py @@ -1,2 +1,3 @@ SMDEBUG_GRADIENTS_KEY = "smdebug_gradients" SMDEBUG_LAYER_OUTPUTS_KEY = "smdebug_layer_outputs" +SMDEBUG_PREFIX = "smdebug_" diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 4d05fe642..aa257ca17 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -15,7 +15,7 @@ # Local from .base_hook import TensorflowBaseHook from .collection import CollectionKeys -from .constants import SMDEBUG_GRADIENTS_KEY, SMDEBUG_LAYER_OUTPUTS_KEY +from .constants import SMDEBUG_GRADIENTS_KEY, SMDEBUG_LAYER_OUTPUTS_KEY, SMDEBUG_PREFIX from .tensor_ref import TensorRef, get_tf_names from .utils import ( ModelInput, @@ -391,7 +391,6 @@ def save_tensor(self, tensor_name, tensor_value, collections_to_write="default") if validate_custom_tensor_value(tensor_value, self._make_numpy_array) is False: self.logger.warn("The tensor value could not be converted into a numpy value") return - if isinstance(collections_to_write, str): collections_to_write = [collections_to_write] @@ -403,11 +402,10 @@ def _save_custom_tensors_post_step(self): # that the user has saved with the save_tensor api for tensor_name in self.custom_tensors_to_save: tensor_value, collection_names = self.custom_tensors_to_save[tensor_name] - self._save_tensor(tensor_name, tensor_value, collection_names) - # Clear saved custom tensors + self._save_tensor_to_file(tensor_name, tensor_value, collection_names) self.custom_tensors_to_save.clear() - def _save_tensor(self, tensor_name, tensor_value, collections): + def _save_tensor_to_file(self, tensor_name, tensor_value, collections): if isinstance(collections, set) is False: collections = {collections} # Since this function modifies the set, there is a possibility @@ -442,7 +440,7 @@ def save_smdebug_logs(self, logs): for key in logs: tensors_to_save = [] collections_to_write = set() - if "smdebug_" in key: + if SMDEBUG_PREFIX in key: # Save Model Outputs if key in ModelOutputs: export_name = get_model_output_export_name(key) @@ -520,7 +518,7 @@ def _save_layer_input_and_outputs(self, grad_tape=False): if self._is_collection_being_saved_for_step(CollectionKeys.LAYERS) else set() ) - self._save_tensor(export_name, tensor.numpy(), input_collection) + self._save_tensor_to_file(export_name, tensor.numpy(), input_collection) # Save Output tensor = self.saved_layers[layer_name].layer_output export_name = get_export_name_for_keras(layer_name, tensor_type="output", tensor=tensor) @@ -530,7 +528,7 @@ def _save_layer_input_and_outputs(self, grad_tape=False): if self._is_collection_being_saved_for_step(CollectionKeys.LAYERS) else set() ) - self._save_tensor(export_name, tensor.numpy(), output_collection) + self._save_tensor_to_file(export_name, tensor.numpy(), output_collection) def _save_tensors_post_step(self, batch, logs): # some tensors available as value from within hook are saved here @@ -733,7 +731,7 @@ def _save_layer_values(self, layer_outputs, collection, model=None, inputs=None) export_name = get_export_name_for_keras(l.name, tensor_suffix) tensors_to_save.append((export_name, o)) for t_name, t_value in tensors_to_save: - self._save_tensor(t_name, t_value, collections_to_write) + self._save_tensor_to_file(t_name, t_value, collections_to_write) def save_layer_outputs(self, layer_outputs, model=None): self._save_layer_values(layer_outputs, self.get_collection(CollectionKeys.LAYERS), model) diff --git a/tests/mxnet/mnist_gluon_model.py b/tests/mxnet/mnist_gluon_model.py index daeba9b6e..028f0da1a 100644 --- a/tests/mxnet/mnist_gluon_model.py +++ b/tests/mxnet/mnist_gluon_model.py @@ -28,6 +28,7 @@ def run_mnist_gluon_model( make_input_zero=False, normalize_mean=0.13, normalize_std=0.31, + save_custom_tensor=False, ): batch_size = 4 if make_input_zero: @@ -103,6 +104,8 @@ def run_mnist_gluon_model( eval_acc_name = "loss_acc" # Start the training. + if save_custom_tensor: + hook.save_tensor("custom_tensor_1", mx.nd.array([1, 2, 3])) for epoch in range(1): train_loss, train_acc, valid_acc = 0.0, 0.0, 0.0 tic = time.time() @@ -111,6 +114,8 @@ def run_mnist_gluon_model( i = 0 for data, label in train_data: + if save_custom_tensor: + hook.save_tensor("custom_tensor_2", mx.nd.array([1, 2, 3])) data = data.as_in_context(mx.cpu(0)) # forward + backward with autograd.record(): @@ -124,6 +129,10 @@ def run_mnist_gluon_model( train_acc += acc(output, label) # hook.save_scalar(train_loss_name, train_loss) # hook.save_scalar(train_acc_name, train_acc) + if save_custom_tensor: + # This tensor will not be added to default collections since + # collections have already been exported + hook.save_tensor("custom_tensor_3", mx.nd.array([1, 2, 3])) i += 1 if num_steps_train is not None and i >= num_steps_train: break diff --git a/tests/mxnet/test_custom_tensor.py b/tests/mxnet/test_custom_tensor.py new file mode 100644 index 000000000..0c87f8331 --- /dev/null +++ b/tests/mxnet/test_custom_tensor.py @@ -0,0 +1,32 @@ +# Standard Library +import shutil +from datetime import datetime + +# First Party +from smdebug import SaveConfig +from smdebug.core.collection import CollectionKeys +from smdebug.mxnet.hook import Hook as t_hook +from smdebug.trials import create_trial + +# Local +from .mnist_gluon_model import run_mnist_gluon_model + + +def test_hook(): + save_config = SaveConfig(save_steps=[0, 1, 2, 3]) + run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f") + out_dir = "/tmp/newlogsRunTest/" + run_id + hook = t_hook(out_dir=out_dir, save_config=save_config) + run_mnist_gluon_model( + hook=hook, + num_steps_train=10, + num_steps_eval=10, + register_to_loss_block=True, + save_custom_tensor=True, + ) + trial = create_trial(out_dir) + custom_tensors = trial.tensor_names(collection=CollectionKeys.DEFAULT) + all_tensors = trial.tensor_names() + assert len(custom_tensors) == 2 + assert len(all_tensors) == 4 + shutil.rmtree(out_dir) diff --git a/tests/mxnet/test_hook_all_zero.py b/tests/mxnet/test_hook_all_zero.py index 1d6c0b00a..c9c693d80 100644 --- a/tests/mxnet/test_hook_all_zero.py +++ b/tests/mxnet/test_hook_all_zero.py @@ -36,8 +36,7 @@ def test_hook_all_zero(hook=None, out_dir=None): assert tr assert len(tr.steps()) == 4 - tnames = tr.tensor_names(regex="conv._input") - tname = tr.tensor_names(regex="conv._input")[0] + tname = tr.tensor_names(regex="conv.+_input")[0] conv_tensor_value = tr.tensor(tname).value(step_num=0) is_zero = np.all(conv_tensor_value == 0) assert is_zero == True diff --git a/tests/pytorch/test_save_custom_tensor.py b/tests/pytorch/test_save_custom_tensor.py new file mode 100644 index 000000000..c8713ac87 --- /dev/null +++ b/tests/pytorch/test_save_custom_tensor.py @@ -0,0 +1,35 @@ +# Standard Library +import shutil +from datetime import datetime + +# Third Party +import torch +import torch.optim as optim + +# First Party +from smdebug.core.collection import CollectionKeys +from smdebug.pytorch import SaveConfig +from smdebug.pytorch.hook import Hook as t_hook +from smdebug.trials import create_trial + +# Local +from .utils import Net, train + + +def test_hook(): + run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f") + out_dir = "/tmp/" + run_id + hook = t_hook( + out_dir=out_dir, + save_config=SaveConfig(save_steps=[0, 1, 2, 3]), + include_collections=["relu_activations"], + ) + + model = Net().to(torch.device("cpu")) + hook.register_module(model) + optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9) + train(model, hook, torch.device("cpu"), optimizer, num_steps=10, save_custom_tensor=True) + trial = create_trial(out_dir) + custom_tensors = trial.tensor_names(collection=CollectionKeys.DEFAULT) + assert len(custom_tensors) == 4 + shutil.rmtree(out_dir) diff --git a/tests/pytorch/utils.py b/tests/pytorch/utils.py index 5ac1d399b..45978cfd1 100644 --- a/tests/pytorch/utils.py +++ b/tests/pytorch/utils.py @@ -35,12 +35,19 @@ def forward(self, x): return F.log_softmax(x, dim=1) -def train(model, hook, device, optimizer, num_steps=500, set_modes=False): +def train(model, hook, device, optimizer, num_steps=500, set_modes=False, save_custom_tensor=False): + if save_custom_tensor: + hook.save_tensor("custom_tensor_0", torch.tensor([[1.0, -1.0], [1.0, -1.0]])) + if set_modes: hook.set_mode(modes.TRAIN) + if save_custom_tensor: + hook.save_tensor("custom_tensor_1", torch.tensor([[1.0, -1.0], [1.0, -1.0]])) + model.train() - # for batch_idx, (data, target) in enumerate(train_loader): + if save_custom_tensor: + hook.save_tensor("custom_tensor_2", torch.tensor([[1.0, -1.0], [1.0, -1.0]])) for i in range(num_steps): batch_size = 32 data, target = torch.rand(batch_size, 1, 28, 28), torch.rand(batch_size).long() @@ -49,6 +56,8 @@ def train(model, hook, device, optimizer, num_steps=500, set_modes=False): output = model(Variable(data, requires_grad=True)) loss = F.nll_loss(output, target) hook.record_tensor_value("nll_loss", tensor_value=loss) + if save_custom_tensor: + hook.save_tensor("custom_tensor_3", torch.tensor([[1.0, -1.0], [1.0, -1.0]])) loss.backward() optimizer.step() From 067e72487499988fa33acdb2b871db7ae8e4bd3a Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Mon, 27 Jul 2020 17:08:06 -0700 Subject: [PATCH 144/149] parameterize test keras fit --- tests/tensorflow2/test_keras.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 20038f139..462fdb555 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -397,8 +397,9 @@ def test_gradtape_persistent(out_dir, saveall): @pytest.mark.slow +@pytest.mark.parametrize("run_eagerly", [True, False]) @pytest.mark.parametrize("saveall", [True, False]) -def test_keras_fit(out_dir, tf_eager_mode, saveall): +def test_keras_fit(out_dir, tf_eager_mode, run_eagerly, saveall): hook = smd.KerasHook(out_dir=out_dir, save_all=saveall) ts = time.time() hook.save_scalar("foobar", 1, sm_metric=True, timestamp=ts) @@ -407,7 +408,8 @@ def test_keras_fit(out_dir, tf_eager_mode, saveall): helper_keras_fit( trial_dir=out_dir, hook=hook, - run_eagerly=tf_eager_mode, + run_eagerly=run_eagerly, + eager=tf_eager_mode, steps=["train", "eval", "predict", "train"], ) From 49550e8cd240bc26c2ed3baa4cc7da637b1f7ef4 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Mon, 27 Jul 2020 19:11:45 -0700 Subject: [PATCH 145/149] tf eager --- tests/tensorflow2/test_keras.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 462fdb555..3359826fb 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -397,9 +397,8 @@ def test_gradtape_persistent(out_dir, saveall): @pytest.mark.slow -@pytest.mark.parametrize("run_eagerly", [True, False]) @pytest.mark.parametrize("saveall", [True, False]) -def test_keras_fit(out_dir, tf_eager_mode, run_eagerly, saveall): +def test_keras_fit(out_dir, tf_eager_mode, saveall): hook = smd.KerasHook(out_dir=out_dir, save_all=saveall) ts = time.time() hook.save_scalar("foobar", 1, sm_metric=True, timestamp=ts) @@ -408,7 +407,6 @@ def test_keras_fit(out_dir, tf_eager_mode, run_eagerly, saveall): helper_keras_fit( trial_dir=out_dir, hook=hook, - run_eagerly=run_eagerly, eager=tf_eager_mode, steps=["train", "eval", "predict", "train"], ) @@ -417,7 +415,7 @@ def test_keras_fit(out_dir, tf_eager_mode, run_eagerly, saveall): # can't save gradients in TF 2.x eager mode if saveall: # save losses, metrics, weights, biases, scalar if tf_eager_mode: - assert len(trial.tensor_names()) == (21 if is_tf_2_2() else 22) + assert len(trial.tensor_names()) == (13 if is_tf_2_2() else 14) assert len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == 0 assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == 0 else: From cb44a7dbcc3b4549398ba788054b5a3d1d041f6a Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Mon, 27 Jul 2020 20:24:42 -0700 Subject: [PATCH 146/149] nit --- smdebug/tensorflow/keras.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index aa257ca17..9da4ca092 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -480,7 +480,7 @@ def save_smdebug_logs(self, logs): else set() ) for t_name, t_value in tensors_to_save: - self._save_tensor(t_name, t_value, collections_to_write) + self._save_tensor_to_file(t_name, t_value, collections_to_write) def _save_metrics(self, batch, logs, force_save=False): # if force_save is True, doesn't check whether collection needs to be saved for steps From b67fa45b4765b510d70fb6c0a5265461bed4efe2 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Mon, 27 Jul 2020 21:19:17 -0700 Subject: [PATCH 147/149] nit and remove duped fn --- smdebug/tensorflow/keras.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 9da4ca092..f213e6dd3 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -8,7 +8,7 @@ # First Party from smdebug.core.modes import ModeKeys -from smdebug.core.utils import match_inc, validate_custom_tensor_value +from smdebug.core.utils import match_inc from smdebug.tensorflow.callable_cache import CallableCache from smdebug.tensorflow.utils import InputOutputSaver, get_layer_call_fn @@ -387,16 +387,6 @@ def _add_metric(self, metric_name, metric_value: tf.Tensor = None): coll.set_tensor_ref(TensorRef.from_non_graph_var(metric_name)) self.tensor_to_collections[metric_name] = {coll} - def save_tensor(self, tensor_name, tensor_value, collections_to_write="default"): - if validate_custom_tensor_value(tensor_value, self._make_numpy_array) is False: - self.logger.warn("The tensor value could not be converted into a numpy value") - return - if isinstance(collections_to_write, str): - collections_to_write = [collections_to_write] - - for collection in collections_to_write: - self.custom_tensors_to_save[tensor_name] = (tensor_value, collection) - def _save_custom_tensors_post_step(self): # This saves all the values of custom tensors # that the user has saved with the save_tensor api From 075b2a0a0c142676ef864c2f76517d84b37ecbb8 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Mon, 27 Jul 2020 22:19:52 -0700 Subject: [PATCH 148/149] refactor --- smdebug/core/hook.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/smdebug/core/hook.py b/smdebug/core/hook.py index 1885ca09d..10577fed6 100644 --- a/smdebug/core/hook.py +++ b/smdebug/core/hook.py @@ -240,6 +240,7 @@ def __init__( # Check if there is any last saved state. Initialize the hook based last saved state. self.training_run = 0 self._initialize_to_last_saved_state() + self.custom_tensors_to_save = dict() # This will avoid pickling of BaseHook object def __getstate__(self): @@ -536,6 +537,23 @@ def _write_state(self): current_state[LATEST_MODE_STEP] = mode_step self.state_store.update_state(current_state) + def save_tensor(self, tensor_name, tensor_value, collections_to_write=CollectionKeys.DEFAULT): + if validate_custom_tensor_value(tensor_value, self._make_numpy_array) is False: + self.logger.warn("The tensor value could not be converted into a numpy value") + return + if isinstance(collections_to_write, str): + collections_to_write = [collections_to_write] + for collection in collections_to_write: + self.custom_tensors_to_save[tensor_name] = (tensor_value, collection) + + def _save_custom_tensors_post_step(self): + for tensor_name in self.custom_tensors_to_save: + tensor_value, collection_names = self.custom_tensors_to_save[tensor_name] + c = self.collection_manager.get(collection_names, create=True) + c.add_tensor_name(tensor_name) + self._write_raw_tensor(tensor_name, tensor_value, [c]) + self.custom_tensors_to_save.clear() + def set_mode(self, mode): # train if mode in ALLOWED_MODES: @@ -881,7 +899,6 @@ def __init__( ) self.exported_collections = False self.data_type_name = data_type_name - self.custom_tensors_to_save = dict() def _cleanup(self): if not self.exported_collections: @@ -907,23 +924,6 @@ def _write(self, module_name, var, suffix, idx): ) return idx - def save_tensor(self, tensor_name, tensor_value, collections_to_write=CollectionKeys.DEFAULT): - if validate_custom_tensor_value(tensor_value, self._make_numpy_array) is False: - self.logger.warn("The tensor value could not be converted into a numpy value") - return - if isinstance(collections_to_write, str): - collections_to_write = [collections_to_write] - for collection in collections_to_write: - self.custom_tensors_to_save[tensor_name] = (tensor_value, collection) - - def _save_custom_tensors_post_step(self): - for tensor_name in self.custom_tensors_to_save: - tensor_value, collection_names = self.custom_tensors_to_save[tensor_name] - c = self.collection_manager.get(collection_names, create=True) - c.add_tensor_name(tensor_name) - self._write_raw_tensor(tensor_name, tensor_value, [c]) - self.custom_tensors_to_save.clear() - def _write_inputs(self, name, inputs): tensor_name = name + CallbackHook.INPUT_TENSOR_SUFFIX idx = self.written_tensor_name_for_step.get(tensor_name, 0) From 1a1838e0e927fc06a522f719cd97f8cacf5717c9 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Mon, 27 Jul 2020 23:17:22 -0700 Subject: [PATCH 149/149] retrigger CI