Merged
Changes from all commits (49 commits)
70b927d
default config
NihalHarish Aug 18, 2020
63ee466
add test
NihalHarish Aug 18, 2020
171522a
add mxnet and pytorch tests
NihalHarish Aug 18, 2020
1865f27
differentiate between tf and other frameworks
NihalHarish Aug 19, 2020
82f2bd6
should save layer
NihalHarish Aug 19, 2020
5762c53
handle kernelGrad
NihalHarish Aug 19, 2020
e83e126
init
NihalHarish Aug 19, 2020
63a9cad
run only for grad tape
NihalHarish Aug 19, 2020
230177b
str cast
NihalHarish Aug 19, 2020
bef5e61
scan
NihalHarish Aug 19, 2020
5204cf2
fix
NihalHarish Aug 20, 2020
b878e5f
update
NihalHarish Aug 20, 2020
744e627
limit gradient loop
NihalHarish Aug 20, 2020
c6765ca
modify assert
NihalHarish Aug 20, 2020
ade21de
Merge remote-tracking branch 'origin' into default_hook_config_gamma
NihalHarish Aug 20, 2020
dc0b0f1
logs is None
NihalHarish Aug 20, 2020
1191fa5
PR comments
NihalHarish Aug 21, 2020
85e69e5
save tensor in custom collection
NihalHarish Aug 21, 2020
6808cb7
register hook only for gradtape
NihalHarish Aug 23, 2020
bed51bc
attach layer hook only for grad tape
NihalHarish Aug 23, 2020
800b65c
register model
NihalHarish Aug 23, 2020
683c758
make tf log compliant
NihalHarish Aug 23, 2020
aeb2ba1
update tf2_2
NihalHarish Aug 23, 2020
311b131
unpack input
NihalHarish Aug 24, 2020
777567d
nit: correct comments
NihalHarish Aug 24, 2020
fdd537c
retrigger CI
NihalHarish Aug 24, 2020
469c5dd
retrigger CI
NihalHarish Aug 24, 2020
9f25d63
revert assert change
NihalHarish Aug 24, 2020
6c4ec8f
save gradients
NihalHarish Aug 24, 2020
03e6d26
address ordering bug
NihalHarish Aug 26, 2020
0613b32
retrigger CI
NihalHarish Aug 26, 2020
bdb35bd
refactor
NihalHarish Aug 26, 2020
6f570b4
merge origin
NihalHarish Aug 26, 2020
b3a3ccd
Revert "Add ability to only save shapes of tensors (#328)"
NihalHarish Aug 26, 2020
5ff1266
rename fns
NihalHarish Aug 27, 2020
0bbe25b
gradients refactor
NihalHarish Aug 28, 2020
7c5312c
revert pr 328-shapes
NihalHarish Aug 28, 2020
c0682da
test should_save_tensor
NihalHarish Aug 28, 2020
02a026c
reduce code dupe
NihalHarish Aug 28, 2020
368feb4
retrigger CI
NihalHarish Aug 28, 2020
a49a406
loop over tuple
NihalHarish Aug 28, 2020
7d8bb23
merge master
NihalHarish Aug 28, 2020
4de3e63
extract v name
NihalHarish Aug 28, 2020
4f7efae
extract v name
NihalHarish Aug 28, 2020
6861bc0
fix bug in zcc test
NihalHarish Aug 29, 2020
9d9359b
comments
NihalHarish Aug 30, 2020
b4b6924
rename fn
NihalHarish Aug 30, 2020
c42d207
retrigger CI
NihalHarish Aug 31, 2020
03e2922
retrigger CI
NihalHarish Aug 31, 2020
2 changes: 2 additions & 0 deletions smdebug/core/config_constants.py
@@ -41,3 +41,5 @@

CALLABLE_CACHE_ENV_VAR = "SMDEBUG_KERAS_CALLABLE_CACHE_TYPE"
DEFAULT_CALLABLE_CACHE = "CACHE_PER_MODE"

DEFAULT_SAVED_COLLECTIONS = ["losses"]
23 changes: 23 additions & 0 deletions smdebug/core/hook.py
@@ -22,6 +22,7 @@
)
from smdebug.core.collection_manager import CollectionManager
from smdebug.core.config_constants import (
DEFAULT_SAVED_COLLECTIONS,
DEFAULT_WORKER_NAME,
LATEST_GLOBAL_STEP_SAVED,
LATEST_GLOBAL_STEP_SEEN,
@@ -343,6 +344,13 @@ def _get_collections_to_save_for_step(self) -> Set["Collection"]:
)
return self._collections_to_save_for_step

def is_tensor_saved_for_step(self, tensor_name):
Contributor: should_save_tensor_for_step ?

collections_to_save = self._get_collections_to_save_for_step()
for c in collections_to_save:
if match_inc(tensor_name, c.include_regex):
return True
return False
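Note: match_inc checks the tensor name against each collection's include_regex patterns. A rough standalone sketch of the matching this relies on, assuming plain re.search semantics (illustrative, not smdebug's exact implementation):

    import re

    def matches_any_include_regex(tensor_name, include_regex):
        # True if the tensor name matches any of the collection's include patterns
        return any(re.search(pattern, tensor_name) for pattern in include_regex)

    # e.g. a custom collection created with include_regex=["dense.*output"]
    matches_any_include_regex("dense_1/output_0", ["dense.*output"])  # True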

def _get_collections_with_tensor(self, tensor_name) -> Set["Collection"]:
self._assert_prep()
# for tf this will be prepopulated in check_and_add_tensor
@@ -364,6 +372,14 @@ def _get_collections_with_tensor(self, tensor_name) -> Set["Collection"]:
def _get_default_collections(self):
pass

def has_default_hook_configuration(self):
# Used in the internal framework forks to determine if the hook
# is using the default hook configuration
collections_being_saved = [x.name for x in self._collections_to_save]
if set(collections_being_saved) == set(DEFAULT_SAVED_COLLECTIONS):
return True
return False
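Note: a hypothetical call site sketching how an internal framework fork might use this check; the hook instance and the surrounding logic are assumptions, not part of this PR:

    # hook is an already-created smdebug hook instance
    if hook.has_default_hook_configuration():
        # only the default collections (here just "losses") are being saved,
        # so a framework fork can skip its heavier capture paths
        pass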

def _prepare_collections(self):
"""Populate collections_to_save and ensure every collection has
a save_config and reduction_config."""
@@ -525,6 +541,13 @@ def _increment_step(self):
self.mode_steps[ModeKeys.GLOBAL] = self.step
self._collections_to_save_for_step = None

# Called in the internal AWS codebase to determine
# if a particular tensor value should be saved
def should_save_tensor_or_collection(self, tensor_name: str, collection_name: str) -> bool:
if self._is_collection_being_saved_for_step(collection_name):
return True
return self.is_tensor_saved_for_step(tensor_name)
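Note: an illustrative call pattern from the framework side; the tensor and collection names are hypothetical:

    # checked before materializing a tensor value in the framework fork
    if hook.should_save_tensor_or_collection(
        tensor_name="dense_1/kernel:0", collection_name="weights"
    ):
        # either the "weights" collection is saved this step, or the name
        # matches an include_regex of a collection saved this step
        pass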

def _write_state(self):
if self.state_store.is_checkpoint_updated():
current_state = dict()
9 changes: 9 additions & 0 deletions smdebug/tensorflow/base_hook.py
@@ -20,6 +20,7 @@

# Local
from .collection import CollectionKeys, CollectionManager
from .constants import TF_DEFAULT_SAVED_COLLECTIONS
from .singleton_utils import set_hook
from .utils import (
TFDistributionStrategy,
@@ -217,6 +218,14 @@ def export_collections(self):
collection_file_name = f"{self.worker}_collections.json"
self.collection_manager.export(self.out_dir, collection_file_name)

def has_default_hook_configuration(self):
Contributor: why is this function repeated here?
Contributor Author: I have overridden the fn in the core hook to utilize a constant defined for TF.

# Used in AWS TF to determine if the hook
# is using the default hook configuration
collections_being_saved = [x.name for x in self._collections_to_save]
if set(collections_being_saved) == set(TF_DEFAULT_SAVED_COLLECTIONS):
return True
return False

def _get_custom_and_default_collections(self) -> Tuple[Set["Collection"], Set["Collection"]]:
if self._custom_collections is None:
self._custom_collections = set()
2 changes: 2 additions & 0 deletions smdebug/tensorflow/constants.py
@@ -1,3 +1,5 @@
SMDEBUG_GRADIENTS_KEY = "smdebug_gradients"
SMDEBUG_LAYER_OUTPUTS_KEY = "smdebug_layer_outputs"
SMDEBUG_PREFIX = "smdebug_"

TF_DEFAULT_SAVED_COLLECTIONS = ["losses", "metrics", "sm_metrics"]
135 changes: 82 additions & 53 deletions smdebug/tensorflow/keras.py
@@ -30,6 +30,7 @@
get_model_input_export_name,
get_model_output_export_name,
is_keras_optimizer,
is_tf_version_2_3_x,
is_tf_version_2x,
)

@@ -71,6 +72,14 @@ def __init__(
) # stores custom tensors saved by users every step
self.saved_layers = dict()
self.has_registered_model = False
# supports_tf_logs property was introduced in TF 2.3.0
# it indicates to the framework that the callback is not
# limited to reading only numpy logs
self._supports_tf_logs = True
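Note: from TF 2.3 onward, Keras checks this attribute per callback and only converts logs to numpy for callbacks that do not opt in; a rough sketch of that kind of check (illustrative, not TF's verbatim internals):

    def callback_wants_tf_logs(callback):
        # callbacks without the attribute get numpy-converted logs, which forces
        # an eager-tensor -> numpy copy on every batch; opting in avoids that
        return getattr(callback, "_supports_tf_logs", False)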
Contributor Author: Comment
Contributor: add a comment.

# TF 2.3.0 has a callback ordering bug
# this flag indicates to the train_batch_begin callback
# that the step was already incremented in the on_train_begin callback
self.step_incremented_in_on_train_begin = False
Contributor: comment about what this is.


def _is_not_supported(self):
if self.distribution_strategy is None:
@@ -109,7 +118,8 @@ def register_model(self, model):
# It attaches a hook to every layer of the model to capture
# layer values
self.model = model
self._wrap_model_with_input_output_saver()
if self.tape is not None:
Contributor: why this change?
Contributor Author: Wrapping the model isn't needed with the new TF changes that capture layer inputs and outputs when we use .fit.

self._wrap_model_with_input_output_saver()
self.has_registered_model = True

def _get_matching_collections(
Expand Down Expand Up @@ -348,7 +358,10 @@ def _prepare_tensors_available_post_step(self):

# Add tensor to custom collections
for custom_coll in custom_collections:
if match_inc(tensor_ref.name, custom_coll.include_regex):
if (
match_inc(tensor_ref.name, custom_coll.include_regex)
and tensor_ref.tf_obj is not None
Contributor: why this check? Was there an issue without it?
Contributor Author: tensor_ref.tf_obj is None for certain tensors (kernelGrad) and ends up throwing a warning.

):
custom_coll.add_for_mode(tensor_ref.tf_obj, self.mode)
if custom_coll not in self.tensor_to_collections[tensor_ref.name]:
self.tensor_to_collections[tensor_ref.name].add(custom_coll)
@@ -390,6 +403,12 @@ def _save_custom_tensors_post_step(self):
self._save_tensor_to_file(tensor_name, tensor_value, collection_names)
self.custom_tensors_to_save.clear()

def should_save_layer(self, layer_name):
# Called in AWS TF to determine
# if a particular layer value
# should be saved
return self.should_save_tensor_or_collection(layer_name, CollectionKeys.LAYERS)
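Note: a hypothetical call site in the AWS TF fork's layer wrapper; the names are assumed for illustration:

    # inside a wrapped layer call in the framework fork
    if hook.should_save_layer(layer.name):
        # capture this layer's inputs and outputs for the current step
        pass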

def _save_tensor_to_file(self, tensor_name, tensor_value, collections):
if isinstance(collections, set) is False:
collections = {collections}
@@ -418,6 +437,31 @@ def _save_tensor_to_file(self, tensor_name, tensor_value, collections):
collection.set_tensor_ref(tensor_ref)
self._save_for_tensor(tensor_name, t, check_before_write=True)

def save_gradients_from_logs(self, gradients):
if gradients is not None:
gradient_collection = self.get_collection(CollectionKeys.GRADIENTS)
step_collections = self._get_collections_to_save_for_step()
collections_to_write = (
{gradient_collection} if gradient_collection in step_collections else set()
)
if gradients and isinstance(gradients[0], tuple) is False:
gradients = zip(self.model.trainable_variables, gradients)
for v, g in gradients:
if isinstance(v, tf.Tensor):
# Tensor.name is meaningless with eager execution
layer_name = str(v.numpy(), "utf-8")
elif isinstance(v, tf.Variable):
layer_name = v.name
else:
layer_name = v
layer_name = layer_name.split(":")[0]
export_name = "gradients/" + layer_name + "Grad"
if isinstance(g, IndexedSlices):
# This class is a simple wrapper for a pair of Tensor objects
# See: https://www.tensorflow.org/api_docs/python/tf/IndexedSlices
g = g.values
self._save_tensor_to_file(export_name, g, collections_to_write)
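Note: a minimal sketch of both accepted shapes, assuming an eager training loop with model, x, y, loss_fn, and hook already in scope; bare gradient lists are zipped against model.trainable_variables, while (variable, gradient) pairs are used as-is:

    with tf.GradientTape() as tape:
        loss = loss_fn(y, model(x))
    grads = tape.gradient(loss, model.trainable_variables)

    hook.save_gradients_from_logs(grads)  # bare list, zipped with trainable_variables
    hook.save_gradients_from_logs(list(zip(model.trainable_variables, grads)))  # (var, grad) pairs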

def save_smdebug_logs(self, logs):
if logs is None:
return
@@ -437,24 +481,10 @@ def save_smdebug_logs(self, logs):
)
# Save Gradients
elif key == SMDEBUG_GRADIENTS_KEY:
gradients = logs[key]
if gradients is not None:
for g, v in zip(gradients, self.model.trainable_variables):
layer_name = v.name
if len(layer_name.split(":")) > 1:
layer_name = layer_name.split(":")[0]
export_name = "gradients/" + layer_name + "Grad"
if isinstance(g, IndexedSlices):
# This class is a simple wrapper for a pair of Tensor objects
# See: https://www.tensorflow.org/api_docs/python/tf/IndexedSlices
g = g.values
tensors_to_save.append((export_name, g))
collections_to_write = {self.get_collection(CollectionKeys.GRADIENTS)}
self.save_gradients_from_logs(logs[key])
# Save Intermediate Layers
elif key == SMDEBUG_LAYER_OUTPUTS_KEY:
layer_outputs = logs[key]
self.save_layer_outputs(layer_outputs)
self.save_layer_inputs(logs[ModelInput.INPUTS], layer_outputs)
self._save_layer_values(logs[key])
# Save Model Inputs
elif key in ModelInputs:
export_name = get_model_input_export_name()
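Note: for reference, a sketch of the logs payload this method expects; the keys come from smdebug/tensorflow/constants.py, the values shown here are placeholders produced by the internal framework code, and in training these entries arrive via the Keras callback's logs dict rather than a direct call:

    logs = {
        "smdebug_gradients": grads,                     # SMDEBUG_GRADIENTS_KEY
        "smdebug_layer_outputs": layer_name_io_tuples,  # SMDEBUG_LAYER_OUTPUTS_KEY
        # plus model input entries under the ModelInput keys
    }
    hook.save_smdebug_logs(logs)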
@@ -489,10 +519,9 @@ def _save_metrics(self, batch, logs, force_save=False):
self._add_metric(metric_name=key)
self._save_for_tensor(key, logs[key], check_before_write=False)

def _save_layer_input_and_outputs(self, grad_tape=False):
# Iterates over all the saved layers for input and output values
if is_tf_version_2x() is False or (grad_tape is False and self.model.run_eagerly is False):
# This function only works when the run_eagerly is True
def _save_layer_input_and_outputs(self):
# Run only for GradTape
if self.tape is None:
return
for layer_name in self.saved_layers:
# Save Input
@@ -520,7 +549,6 @@ def _save_tensors_post_step(self, batch, logs):
# weights, metrics
self._save_metrics(batch, logs)
self.save_smdebug_logs(logs)
self._save_layer_input_and_outputs()
self._save_custom_tensors_post_step()

if is_tf_version_2x() and tf.executing_eagerly():
@@ -615,6 +643,13 @@ def _on_any_mode_begin(self, mode):
self.graph = tf.get_default_graph()
self.set_mode(mode)

if self.prepared_collections is False and is_tf_version_2_3_x():
# Addresses ordering issues in TF 2.3.0
# sets prepared_collections to True here
self._prepare_collections()
self._increment_step()
self.step_incremented_in_on_train_begin = True

# have to clear callable cache if we are not caching per mode
self.callable_cache.change_mode()

@@ -658,7 +693,12 @@ def _on_any_batch_begin(self, batch, mode, logs=None):
# Write the gradients of the past step if the writer is still available.
if self.writer is not None or len(self.writer_map):
self._close_writers()
self._increment_step()

# Addresses callback ordering bug in TF 2.3.0
if self.step_incremented_in_on_train_begin is False:
self._increment_step()
else:
self.step_incremented_in_on_train_begin = False

if self.prepared_collections is False:
# sets prepared_collections to True here
@@ -668,7 +708,6 @@
if (is_tf_version_2x() and tf.executing_eagerly()) or self._validate_exec_function(
self._get_exec_function(mode)
):
self._wrap_model_with_input_output_saver()
self._prepare_layers(mode)
self._prepare_tensors_available_post_step()
self._prepared_tensors[mode] = True
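Note: a condensed sketch of how the two TF 2.3.0 ordering changes interact; the early increment in the mode-begin callback sets the flag, and the first batch-begin callback consumes it instead of incrementing again:

    # condensed from _on_any_mode_begin / _on_any_batch_begin above (sketch only)
    if not self.prepared_collections and is_tf_version_2_3_x():    # on mode begin
        self._prepare_collections()
        self._increment_step()
        self.step_incremented_in_on_train_begin = True

    if not self.step_incremented_in_on_train_begin:                # on batch begin
        self._increment_step()
    else:
        self.step_incremented_in_on_train_begin = False            # consume the flag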
@@ -698,33 +737,23 @@ def on_test_batch_begin(self, batch, logs=None):
def on_predict_batch_begin(self, batch, logs=None):
self._on_any_batch_begin(batch, ModeKeys.PREDICT, logs=logs)

def _save_layer_values(self, layer_outputs, collection, model=None, inputs=None):
if model is None:
if self.model:
model = self.model
else:
return
if layer_outputs is not None:
tensors_to_save = []
step_collections = self._get_collections_to_save_for_step()
collections_to_write = {collection} if collection in step_collections else set()
tensor_suffix = "output"
if inputs is not None:
layer_outputs = [inputs] + layer_outputs
tensor_suffix = "input"
for o, l in zip(layer_outputs, model.layers):
export_name = get_export_name_for_keras(l.name, tensor_suffix)
tensors_to_save.append((export_name, o))
for t_name, t_value in tensors_to_save:
self._save_tensor_to_file(t_name, t_value, collections_to_write)

def save_layer_outputs(self, layer_outputs, model=None):
self._save_layer_values(layer_outputs, self.get_collection(CollectionKeys.LAYERS), model)

def save_layer_inputs(self, x, layer_outputs, model=None):
self._save_layer_values(
layer_outputs, self.get_collection(CollectionKeys.LAYERS), model, inputs=x
)
def _save_layer_values(self, logs):
if logs is None:
return
step_collections = self._get_collections_to_save_for_step()
layer_collection = self.get_collection(CollectionKeys.LAYERS)
collections_to_write = {layer_collection} if layer_collection in step_collections else set()
Contributor: will we be looping through logs even if collections_to_write = set() ?

Contributor: is this "if" equal to checking

if should_save_layer():
    collections_to_write = {self.get_collection(CollectionKeys.LAYERS)}

?

Contributor Author: We need to loop through logs even if collections_to_write = {} because:

  • layers are passed to smdebug only if the user has requested them with the LAYERS collection or a regex.
  • if CollectionKeys.LAYERS is not present, we must still save since the layer might be present in a regex-based custom collection.

Contributor: which collection will save_tensor_to_file save to if collections_to_write = {} ?

Contributor Author: We'll be saving to custom collections with a matching regex.

for layer_name, layer_input, layer_output in logs:
# Cast layer_name to str since it can also be of type bytes
# when run with mirrored strategy
if len(layer_input) == 1:
# Layer Inputs are flattened and passed as a list into
# the next layer. Unpacking it speeds up the _make_numpy fn.
layer_input = layer_input[0]
layer_input_tensor_name = get_export_name_for_keras(str(layer_name), "input")
self._save_tensor_to_file(layer_input_tensor_name, layer_input, collections_to_write)
layer_output_tensor_name = get_export_name_for_keras(str(layer_name), "output")
self._save_tensor_to_file(layer_output_tensor_name, layer_output, collections_to_write)
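Note: a sketch of the structure this loop expects, inferred from the unpacking above; the entries are produced by the internal fork, the names and values here are placeholders, and layer_name may arrive as bytes under MirroredStrategy:

    # each entry: (layer_name, layer_inputs, layer_output)
    # layer_inputs is a list; single-element lists are unpacked before saving
    logs_for_layers = [
        ("conv2d", [input_batch], conv2d_activations),
        ("dense", [flattened_activations], dense_activations),
    ]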

def _write_optimizer_variables(self):
optimizer_collections = self.collection_manager.get(CollectionKeys.OPTIMIZER_VARIABLES)
@@ -951,7 +980,7 @@ def run(*args, **kwargs):
)

self._write_optimizer_variables()
self._save_layer_input_and_outputs(grad_tape=True)
self._save_layer_input_and_outputs()
if not ((isinstance(loss, tf.Tensor)) and hasattr(loss, "numpy")):
return grads
self._add_metric(metric_name="loss", metric_value=loss)
4 changes: 4 additions & 0 deletions smdebug/tensorflow/utils.py
@@ -384,3 +384,7 @@ def get_keras_mode(mode):

def is_tf_version_2x():
return version.parse(tf.__version__) >= version.parse("2.0.0")


def is_tf_version_2_3_x():
# note: despite the name, this returns True for TF 2.3.0 and all later versions
return version.parse(tf.__version__) >= version.parse("2.3.0")
6 changes: 1 addition & 5 deletions tests/tensorflow2/test_keras.py
@@ -520,11 +520,7 @@ def test_include_regex(out_dir, tf_eager_mode):

tr = create_trial_fast_refresh(out_dir)
tnames = tr.tensor_names(collection="custom_coll")

if tf_eager_mode:
assert len(tnames) == (12 if is_tf_2_2() else 8)
else:
assert len(tnames) == 8
assert len(tnames) == 12
for tname in tnames:
assert tr.tensor(tname).value(0) is not None
