From ac3c25016896cc934394cf3cff38032d7cb70028 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 10 Aug 2020 12:42:35 -0700 Subject: [PATCH 01/40] WIP saveshape --- smdebug/core/hook.py | 23 +++++++++++++++++++++++ smdebug/core/reduction_config.py | 18 +++++++++++++----- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/smdebug/core/hook.py b/smdebug/core/hook.py index 10577fed6..e243f6149 100644 --- a/smdebug/core/hook.py +++ b/smdebug/core/hook.py @@ -343,6 +343,12 @@ def _get_collections_to_save_for_step(self) -> Set["Collection"]: ) return self._collections_to_save_for_step + def _saving_shapes_in_step(self) -> bool: + for coll in self._get_collections_to_save_for_step(): + if coll.reduction_config.save_shape is True: + return True + return False + def _get_collections_with_tensor(self, tensor_name) -> Set["Collection"]: self._assert_prep() # for tf this will be prepopulated in check_and_add_tensor @@ -456,6 +462,9 @@ def _initialize_writers(self, only_initialize_if_missing=False) -> None: return self.writer = FileWriter(trial_dir=self.out_dir, step=self.step, worker=self.worker) + if self._saving_shapes_in_step(): + self.shape_writer = + def _get_writers(self, tensor_name, tensor_ref=None) -> List[FileWriter]: """ :param tensor_name: @@ -725,6 +734,16 @@ def _write_raw_tensor(self, tensor_name, tensor_value, save_collections, tensor_ if reduction_config.save_raw_tensor is True: self._write_raw_tensor_simple(tensor_name, tensor_value, tensor_ref=tensor_ref) break + + def _write_shape(self, tensor_name, tensor_value, save_collections, tensor_ref=None): + for s_col in save_collections: + reduction_config = s_col.reduction_config + if reduction_config.save_shape is True: + numpy_tensor_value = self._make_numpy_array(tensor_value) + this_size, this_shape = size_and_shape(numpy_tensor_value) + + self._write_raw_tensor_simple(tensor_name, tensor_value, tensor_ref=tensor_ref) + break def _write_raw_tensor_simple(self, tensor_name, tensor_value, tensor_ref=None, timestamp=None): # tensor_ref is used by TF @@ -742,6 +761,7 @@ def _write_raw_tensor_simple(self, tensor_name, tensor_value, tensor_ref=None, t timestamp=timestamp, ) + def _save_for_tensor(self, tensor_name, tensor_value, check_before_write=True): """ Identifies if this tensor should be saved for this step @@ -805,6 +825,9 @@ def _write_for_tensor(self, tensor_name, tensor_value, save_collections, tensor_ :param save_collections: list of collections which are being saved for this step """ self._log_save(tensor_name, save_collections) + + self._write_shape(tensor_name, tensor_value, save_collections, tensor_ref=tensor_ref) + # write reductions defined for collections this tensor may be part of self._write_reductions(tensor_name, tensor_value, save_collections, tensor_ref=tensor_ref) diff --git a/smdebug/core/reduction_config.py b/smdebug/core/reduction_config.py index 9a24ff6d6..6ad78cdee 100644 --- a/smdebug/core/reduction_config.py +++ b/smdebug/core/reduction_config.py @@ -1,6 +1,8 @@ # Standard Library import json from typing import Any, Dict +from smdebug.core.logger import get_logger +logger = get_logger() # First Party from smdebug.core.utils import split @@ -8,7 +10,7 @@ ALLOWED_REDUCTIONS = ["min", "max", "mean", "std", "variance", "sum", "prod"] ALLOWED_NORMS = ["l1", "l2"] REDUCTION_CONFIG_VERSION_NUM = "v0" -ALLOWED_PARAMS = ["reductions", "abs_reductions", "norms", "abs_norms", "save_raw_tensor"] +ALLOWED_PARAMS = ["reductions", "abs_reductions", "norms", "abs_norms", "save_raw_tensor", "save_shape"] 
class ReductionConfig: @@ -49,12 +51,14 @@ def __init__( norms=None, abs_norms=None, save_raw_tensor=False, + save_shape=False, ): self.reductions = reductions if reductions is not None else [] self.abs_reductions = abs_reductions if abs_reductions is not None else [] self.norms = norms if norms is not None else [] self.abs_norms = abs_norms if abs_norms is not None else [] self.save_raw_tensor = save_raw_tensor + self.save_shape = save_shape ## DO NOT REMOVE, if you add anything here, please make sure that _check & from_json is updated accordingly self._check() @@ -75,6 +79,9 @@ def _check(self): raise ValueError("abs_norms can only be one of " + ",".join(ALLOWED_NORMS)) if not isinstance(self.save_raw_tensor, bool): raise ValueError(f"save_raw_tensor={self.save_raw_tensor} must be a boolean") + if not isinstance(self.save_shape, bool): + raise ValueError(f"save_shape={self.save_shape} must be a boolean") + @classmethod def from_dict(cls, params: Dict[str, Any]) -> "ReductionConfig": @@ -83,7 +90,7 @@ def from_dict(cls, params: Dict[str, Any]) -> "ReductionConfig": return None if not isinstance(params, dict): raise ValueError(f"params={params} must be dict") - + save_shape = params.get("save_shape", False) save_raw_tensor = params.get("save_raw_tensor", False) # Parse comma-separated string into array all_reductions = split(params.get("reductions", "")) @@ -108,6 +115,7 @@ def from_dict(cls, params: Dict[str, Any]) -> "ReductionConfig": norms=norms, abs_norms=abs_norms, save_raw_tensor=save_raw_tensor, + save_shape=save_shape ) @classmethod @@ -116,7 +124,6 @@ def from_json(cls, json_str: str) -> "ReductionConfig": return cls.from_dict(d) def to_json_dict(self) -> Dict[str, Any]: - save_raw_tensor = self.save_raw_tensor # Convert reductions from various arrays into single comma-separated string all_reductions = [] for red in self.reductions: @@ -129,7 +136,7 @@ def to_json_dict(self) -> Dict[str, Any]: all_reductions.append(f"abs_{red}_norm") all_reductions_str = ",".join(all_reductions) # Return the dict - return {"save_raw_tensor": save_raw_tensor, "reductions": all_reductions_str} + return {"save_raw_tensor": self.save_raw_tensor, "reductions": all_reductions_str, "save_shape": self.save_shape} def to_json(self) -> str: return json.dumps(self.to_json_dict()) @@ -144,10 +151,11 @@ def __eq__(self, other): and self.norms == other.norms and self.abs_norms == other.abs_norms and self.save_raw_tensor == other.save_raw_tensor + and self.save_shape == other.save_shape ) def __repr__(self): return ( f"" + f"abs_reductions={self.abs_reductions}, norms={self.norms}, abs_norms={self.abs_norms}>, save_shape={self.save_shape}, save_raw_tensor={self.save_raw_tensor}" ) From 34084cc2d534e310733add30a5b6d44dc0e997fc Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 10 Aug 2020 13:42:41 -0700 Subject: [PATCH 02/40] Add shape writer --- smdebug/core/hook.py | 16 +++--- smdebug/core/locations.py | 22 ++++++++ smdebug/core/writer.py | 95 ++++++++++++++++++++++++--------- smdebug/tensorflow/base_hook.py | 8 +++ 4 files changed, 109 insertions(+), 32 deletions(-) diff --git a/smdebug/core/hook.py b/smdebug/core/hook.py index e243f6149..aaa640cc9 100644 --- a/smdebug/core/hook.py +++ b/smdebug/core/hook.py @@ -424,7 +424,6 @@ def _close_writers(self) -> None: self.writer = None to_delete_writers = [] - # Delete all the tb writers for mode, writer in self.tb_writers.items(): if writer is not None: @@ -434,6 +433,9 @@ def _close_writers(self) -> None: for mode in to_delete_writers: del 
self.tb_writers[mode] + self.shape_writer.close() + self.shape_writer = None + def _initialize_writers(self, only_initialize_if_missing=False) -> None: # Function is overridden in smdebug/tensorflow/base_hook.py if only_initialize_if_missing and self.writer: @@ -463,7 +465,9 @@ def _initialize_writers(self, only_initialize_if_missing=False) -> None: self.writer = FileWriter(trial_dir=self.out_dir, step=self.step, worker=self.worker) if self._saving_shapes_in_step(): - self.shape_writer = + self.shape_writer = ShapeWriter( + trial_dir=self.out_dir, step=self.step, worker=self.worker + ) def _get_writers(self, tensor_name, tensor_ref=None) -> List[FileWriter]: """ @@ -734,15 +738,14 @@ def _write_raw_tensor(self, tensor_name, tensor_value, save_collections, tensor_ if reduction_config.save_raw_tensor is True: self._write_raw_tensor_simple(tensor_name, tensor_value, tensor_ref=tensor_ref) break - + def _write_shape(self, tensor_name, tensor_value, save_collections, tensor_ref=None): for s_col in save_collections: reduction_config = s_col.reduction_config - if reduction_config.save_shape is True: + if self.dry_run is False and reduction_config.save_shape is True: numpy_tensor_value = self._make_numpy_array(tensor_value) this_size, this_shape = size_and_shape(numpy_tensor_value) - - self._write_raw_tensor_simple(tensor_name, tensor_value, tensor_ref=tensor_ref) + self.shape_writer.write_shape(tensor_name, this_shape) break def _write_raw_tensor_simple(self, tensor_name, tensor_value, tensor_ref=None, timestamp=None): @@ -761,7 +764,6 @@ def _write_raw_tensor_simple(self, tensor_name, tensor_value, tensor_ref=None, t timestamp=timestamp, ) - def _save_for_tensor(self, tensor_name, tensor_value, check_before_write=True): """ Identifies if this tensor should be saved for this step diff --git a/smdebug/core/locations.py b/smdebug/core/locations.py index af703e514..746a7a6cd 100644 --- a/smdebug/core/locations.py +++ b/smdebug/core/locations.py @@ -89,6 +89,28 @@ def get_step_dir_path(cls, trial_dir, step_num): return os.path.join(cls.get_dir(trial_dir), format(step_num, STEP_NUMBER_FORMATTING_LENGTH)) +class ShapeFileLocation(TensorFileLocation): + def __init__(self, step_num, worker_name): + super().__init__(step_num, worker_name) + + def get_filename(self): + step_num_str = self.get_step_num_str() + return f"{step_num_str}_{self.worker_name}_shapes.json" + + @classmethod + def load_filename(cls, s, print_error=True): + name = os.path.basename(s) + m = re.search("(.*)_(.*)_shapes.json$", name) + if m: + step_num = int(m.group(1)) + worker_name = m.group(2) + return cls(step_num=step_num, worker_name=worker_name) + else: + if print_error: + logger.error("Failed to load shape file location: ", s) + return None + + class TensorboardFileLocation(EventFileLocation): def __init__(self, step_num, worker_name, mode=None): super().__init__(step_num, worker_name) diff --git a/smdebug/core/writer.py b/smdebug/core/writer.py index a342cf433..5bda07c0c 100644 --- a/smdebug/core/writer.py +++ b/smdebug/core/writer.py @@ -16,6 +16,9 @@ # under the License. 
"""APIs for logging data in the event file.""" +# Standard Library +from typing import Tuple + # First Party from smdebug.core.modes import MODE_PLUGIN_NAME, MODE_STEP_PLUGIN_NAME from smdebug.core.tfevent.event_file_writer import EventFileWriter @@ -38,7 +41,36 @@ logger = get_logger() -class FileWriter: +class BaseWriter: + def __init__(self, trial_dir, worker, step=0, mode=ModeKeys.GLOBAL): + self.trial_dir = trial_dir + self.step = step + self.worker = worker + if worker is None: + assert False, "Worker should not be none. Check worker name initialization" + self.mode = mode + self._writer = None + + def name(self): + return self._writer.name() + + def __enter__(self): + """Make usable with "with" statement.""" + return self + + def __exit__(self, unused_type, unused_value, unused_traceback): + """Make usable with "with" statement.""" + self.close() + + def flush(self): + """Flushes the event file to disk. + Call this method to make sure that all pending events have been written to disk. + """ + self._writer.flush() + # don't flush index writer as we only want to flush on close + + +class FileWriter(BaseWriter): def __init__( self, trial_dir, @@ -71,12 +103,7 @@ def __init__( verbose : bool Determines whether to print logging messages. """ - self.trial_dir = trial_dir - self.step = step - self.worker = worker - if worker is None: - assert False, "Worker should not be none. Check worker name initialization" - self.mode = mode + super(FileWriter, self).__init__(trial_dir, worker, step, mode) if wtype == "events": el = TensorFileLocation(step_num=self.step, worker_name=self.worker) event_file_path = el.get_file_location(trial_dir=self.trial_dir) @@ -103,14 +130,6 @@ def __init__( ) self._default_bins = _get_default_bins() - def __enter__(self): - """Make usable with "with" statement.""" - return self - - def __exit__(self, unused_type, unused_value, unused_traceback): - """Make usable with "with" statement.""" - self.close() - @staticmethod def _get_metadata(mode, mode_step): sm2 = SummaryMetadata.PluginData(plugin_name=MODE_STEP_PLUGIN_NAME, content=str(mode_step)) @@ -187,13 +206,6 @@ def write_scalar_summary(self, name, value, global_step, timestamp: float = None s = scalar_summary(name, value) self._writer.write_summary(s, global_step, timestamp=timestamp) - def flush(self): - """Flushes the event file to disk. - Call this method to make sure that all pending events have been written to disk. - """ - self._writer.flush() - # don't flush index writer as we only want to flush on close - def close(self): """Flushes the event file to disk and close the file. Call this method when you do not need the summary writer anymore. 
@@ -202,9 +214,6 @@ def close(self): if self.index_writer is not None: self.index_writer.close() - def name(self): - return self._writer.name() - @staticmethod def _check_mode_step(mode, mode_step, global_step): if mode_step is None: @@ -216,3 +225,39 @@ def _check_mode_step(mode, mode_step, global_step): ex_str = "mode can be one of " + ", ".join(mode_keys) raise ValueError(ex_str) return mode, mode_step + + +class ShapeWriter(BaseWriter): + def __init__(self, trial_dir, worker, step=0, mode=ModeKeys.GLOBAL): + super(ShapeWriter, self).__init__(trial_dir, worker, step, mode) + el = ShapeFileLocation(step_num=self.step, worker_name=self.worker, mode=self.mode) + self.file_path = el.get_file_location(base_dir=self.trial_dir) + s3, bucket_name, key_name = is_s3(self.file_path) + if s3: + self._writer = TSAccessS3(bucket_name, key_name, binary=False) + else: + self._writer = TSAccessFile(self.file_path, "a+") + + self.shapes = [] + self.meta = {} + + def write_shape(self, name, shape: Tuple[int]): + self.shapes.append({"name": name, "shape": shape}) + + def flush(self): + if not self._writer: + raise ValueError(f"Cannot flush because self._writer={self._writer}") + if not self.shapes: + raise ValueError(f"Cannot write shapes to file {self.file_path} as it is empty") + + s = json.dumps({"meta": self.meta, "payload": self.shapes}) + self._writer.write(s) + self._writer.flush() + + def close(self): + """Flushes the event file to disk and close the file. + """ + if self._writer is not None: + self.flush() + self._writer.close() + self._writer = None diff --git a/smdebug/tensorflow/base_hook.py b/smdebug/tensorflow/base_hook.py index a8cc9f679..4c935d85c 100644 --- a/smdebug/tensorflow/base_hook.py +++ b/smdebug/tensorflow/base_hook.py @@ -341,6 +341,11 @@ def _initialize_writers(self, only_initialize_if_missing=False) -> None: else: raise NotImplementedError + if self._saving_shapes_in_step(): + self.shape_writer = ShapeWriter( + trial_dir=self.out_dir, step=self.step, worker=self.worker + ) + def _close_writers(self) -> None: if self.dry_run: return @@ -373,6 +378,9 @@ def _close_writers(self) -> None: for mode in to_delete_writers: del self.tb_writers[mode] + self.shape_writer.close() + self.shape_writer = None + def _export_model(self): tb_writer = self._maybe_get_tb_writer() if tb_writer: From e8a6d988bc24a8e83543d6d7b752c7dc1a90bda0 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 10 Aug 2020 17:43:26 -0700 Subject: [PATCH 03/40] Add pytorch test --- smdebug/core/hook.py | 15 ++++-- smdebug/core/writer.py | 16 ++++-- tests/pytorch/test_reduce_config.py | 82 +++++++++++++++++++++++++++++ 3 files changed, 105 insertions(+), 8 deletions(-) diff --git a/smdebug/core/hook.py b/smdebug/core/hook.py index aaa640cc9..1cd8294d8 100644 --- a/smdebug/core/hook.py +++ b/smdebug/core/hook.py @@ -46,7 +46,7 @@ size_and_shape, validate_custom_tensor_value, ) -from smdebug.core.writer import FileWriter +from smdebug.core.writer import FileWriter, ShapeWriter from smdebug.exceptions import InvalidCollectionConfiguration try: @@ -222,7 +222,7 @@ def __init__( self.mode = ModeKeys.GLOBAL self.mode_steps = {ModeKeys.GLOBAL: init_step} self.writer = None - + self.shape_writer = None if is_sagemaker_job() and SageMakerFileMetricsWriter is not None: self.metrics_writer = SageMakerFileMetricsWriter() else: @@ -433,8 +433,9 @@ def _close_writers(self) -> None: for mode in to_delete_writers: del self.tb_writers[mode] - self.shape_writer.close() - self.shape_writer = None + if self.shape_writer is not None: + 
self.shape_writer.close() + self.shape_writer = None def _initialize_writers(self, only_initialize_if_missing=False) -> None: # Function is overridden in smdebug/tensorflow/base_hook.py @@ -745,7 +746,11 @@ def _write_shape(self, tensor_name, tensor_value, save_collections, tensor_ref=N if self.dry_run is False and reduction_config.save_shape is True: numpy_tensor_value = self._make_numpy_array(tensor_value) this_size, this_shape = size_and_shape(numpy_tensor_value) - self.shape_writer.write_shape(tensor_name, this_shape) + if tensor_ref is not None: + name = tensor_ref.tf_obj.name + else: + name = tensor_name + self.shape_writer.write_shape(name, this_shape) break def _write_raw_tensor_simple(self, tensor_name, tensor_value, tensor_ref=None, timestamp=None): diff --git a/smdebug/core/writer.py b/smdebug/core/writer.py index 5bda07c0c..abb606632 100644 --- a/smdebug/core/writer.py +++ b/smdebug/core/writer.py @@ -17,9 +17,11 @@ """APIs for logging data in the event file.""" # Standard Library +import json from typing import Tuple # First Party +from smdebug.core.access_layer import TSAccessFile, TSAccessS3 from smdebug.core.modes import MODE_PLUGIN_NAME, MODE_STEP_PLUGIN_NAME from smdebug.core.tfevent.event_file_writer import EventFileWriter from smdebug.core.tfevent.index_file_writer import IndexWriter @@ -32,9 +34,15 @@ scalar_summary, ) from smdebug.core.tfevent.util import make_tensor_proto +from smdebug.core.utils import is_s3 # Local -from .locations import IndexFileLocationUtils, TensorboardFileLocation, TensorFileLocation +from .locations import ( + IndexFileLocationUtils, + ShapeFileLocation, + TensorboardFileLocation, + TensorFileLocation, +) from .logger import get_logger from .modes import ModeKeys @@ -230,8 +238,8 @@ def _check_mode_step(mode, mode_step, global_step): class ShapeWriter(BaseWriter): def __init__(self, trial_dir, worker, step=0, mode=ModeKeys.GLOBAL): super(ShapeWriter, self).__init__(trial_dir, worker, step, mode) - el = ShapeFileLocation(step_num=self.step, worker_name=self.worker, mode=self.mode) - self.file_path = el.get_file_location(base_dir=self.trial_dir) + el = ShapeFileLocation(step_num=self.step, worker_name=self.worker) + self.file_path = el.get_file_location(trial_dir=self.trial_dir) s3, bucket_name, key_name = is_s3(self.file_path) if s3: self._writer = TSAccessS3(bucket_name, key_name, binary=False) @@ -253,6 +261,8 @@ def flush(self): s = json.dumps({"meta": self.meta, "payload": self.shapes}) self._writer.write(s) self._writer.flush() + self.meta = {} + self.shapes = [] def close(self): """Flushes the event file to disk and close the file. 
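At this stage of the series each step produces a standalone shapes file. ShapeWriter.flush() serializes a single JSON object with a "meta" header and a "payload" list; a hand-written example of that payload (tensor names and shapes are illustrative, taken from a small conv net):

    import json

    # Mirrors write_shape() appending {"name": ..., "shape": ...} entries,
    # then flush() dumping them under "payload":
    json.dumps({
        "meta": {},
        "payload": [
            {"name": "conv1.weight", "shape": [20, 1, 5, 5]},
            {"name": "conv1.bias", "shape": [20]},
        ],
    })

This is the same structure the new pytorch test below parses back with json.load and asserts on.
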
diff --git a/tests/pytorch/test_reduce_config.py b/tests/pytorch/test_reduce_config.py index 230e0410c..c7b525895 100644 --- a/tests/pytorch/test_reduce_config.py +++ b/tests/pytorch/test_reduce_config.py @@ -1,13 +1,18 @@ # Standard Library +import json import os import shutil from datetime import datetime # Third Party import torch +import torch.nn as nn +import torch.nn.functional as F import torch.optim as optim # First Party +from smdebug.core.config_constants import DEFAULT_WORKER_NAME +from smdebug.core.locations import ShapeFileLocation from smdebug.pytorch import ReductionConfig, SaveConfig from smdebug.pytorch.hook import Hook as t_hook from smdebug.trials import create_trial @@ -86,6 +91,83 @@ def test_reduce_config(hook=None, out_dir=None): shutil.rmtree(out_dir) +def test_save_shapes(hook=None, out_dir=None): + class ChildA(nn.Module): + def __init__(self): + super(ChildA, self).__init__() + self.child2 = ChildB() + self.relu0 = nn.ReLU() + + def forward(self, x): + return self.relu0(self.child2(x)) + + class ChildB(nn.Module): + def __init__(self): + super(ChildB, self).__init__() + self.conv1 = nn.Conv2d(1, 20, 5, 1) + + def forward(self, x): + return self.conv1(x) + + class NestedNet(nn.Module): + def __init__(self): + super(NestedNet, self).__init__() + self.child1 = ChildA() + self.max_pool = nn.MaxPool2d(2, stride=2) + self.conv2 = nn.Conv2d(20, 50, 5, 1) + relu_module = nn.ReLU() + self.relu1 = nn.ReLU() + self.max_pool2 = nn.MaxPool2d(2, stride=2) + self.fc1 = nn.Linear(4 * 4 * 50, 500) + self.relu2 = nn.ReLU() + self.fc2 = nn.Linear(500, 10) + + def forward(self, x): + x = self.child1(x) + x = self.max_pool(x) + x = self.relu1(self.conv2(x)) + x = self.max_pool2(x) + x = x.view(-1, 4 * 4 * 50) + x = self.relu2(self.fc1(x)) + x = self.fc2(x) + return F.log_softmax(x, dim=1) + + hook_created = False + if hook is None: + global_reduce_config = ReductionConfig(save_shape=True) + global_save_config = SaveConfig(save_steps=[0]) + + run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f") + out_dir = "/tmp/" + run_id + hook = t_hook( + out_dir=out_dir, + save_config=global_save_config, + save_all=True, + reduction_config=global_reduce_config, + ) + hook_created = True + + model = NestedNet().to(torch.device("cpu")) + hook.register_module(model) + optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9) + train(model, hook, torch.device("cpu"), optimizer, num_steps=10) + + sl = ShapeFileLocation(0, DEFAULT_WORKER_NAME) + path = os.path.join(out_dir, sl.get_file_location()) + with open(path) as jsfile: + shape_dict = json.load(jsfile) + print(shape_dict["payload"]) + assert "payload" in shape_dict + assert len(shape_dict["payload"]) == 41 + for ts in shape_dict["payload"]: + for dim in ts["shape"]: + assert isinstance(dim, int) + assert isinstance(ts["name"], str) + + if hook_created: + shutil.rmtree(out_dir) + + # Test creating hook by loading the json file with reduction configs. 
def test_reduce_config_with_json(): from smdebug.core.json_config import CONFIG_FILE_PATH_ENV_STR From 907cf640cff2ba895f8f2bf9cbab23aa42e33a6c Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 10 Aug 2020 18:30:15 -0700 Subject: [PATCH 04/40] Add untested keras test --- tests/tensorflow2/test_keras.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 4323b5284..ce2965514 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -183,6 +183,29 @@ def helper_keras_gradtape( hook.close() +def test_keras_gradtape_shapes(out_dir): + hook = smd.KerasHook( + out_dir=out_dir, + save_all=saveall, + save_config=SaveConfig(save_steps=[0], reduce_config=ReductionConfig(save_shape=True)), + ) + helper_keras_gradtape(trial_dir=out_dir, hook=hook) + sl = ShapeFileLocation(0, DEFAULT_WORKER_NAME) + path = os.path.join(out_dir, sl.get_file_location()) + with open(path) as jsfile: + shape_dict = json.load(jsfile) + print(shape_dict["payload"]) + assert "payload" in shape_dict + assert len(shape_dict["payload"]) == 41 + for ts in shape_dict["payload"]: + for dim in ts["shape"]: + assert isinstance(dim, int) + assert isinstance(ts["name"], str) + + if hook_created: + shutil.rmtree(out_dir) + + @pytest.mark.skip_if_non_eager @pytest.mark.slow @pytest.mark.parametrize("saveall", [True, False]) From 86842e6f57be54a9ea686a9edf0a6f9f27d0e552 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 10 Aug 2020 18:43:15 -0700 Subject: [PATCH 05/40] fix syntax --- smdebug/tensorflow/base_hook.py | 2 +- tests/tensorflow2/test_keras.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/smdebug/tensorflow/base_hook.py b/smdebug/tensorflow/base_hook.py index 4c935d85c..d2512623a 100644 --- a/smdebug/tensorflow/base_hook.py +++ b/smdebug/tensorflow/base_hook.py @@ -16,7 +16,7 @@ from smdebug.core.reductions import get_numpy_reduction, get_reduction_tensor_name from smdebug.core.tfevent.util import make_numpy_array from smdebug.core.utils import serialize_tf_device -from smdebug.core.writer import FileWriter +from smdebug.core.writer import FileWriter, ShapeWriter # Local from .collection import CollectionKeys, CollectionManager diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index ce2965514..af355dafe 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -186,8 +186,9 @@ def helper_keras_gradtape( def test_keras_gradtape_shapes(out_dir): hook = smd.KerasHook( out_dir=out_dir, - save_all=saveall, - save_config=SaveConfig(save_steps=[0], reduce_config=ReductionConfig(save_shape=True)), + save_all=True, + save_config=SaveConfig(save_steps=[0]), + reduction_config=ReductionConfig(save_shape=True), ) helper_keras_gradtape(trial_dir=out_dir, hook=hook) sl = ShapeFileLocation(0, DEFAULT_WORKER_NAME) From 651c4408394a4adff6bc75589a8f0f5e42fc1718 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 10 Aug 2020 18:45:02 -0700 Subject: [PATCH 06/40] fix syntax --- smdebug/tensorflow/base_hook.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/smdebug/tensorflow/base_hook.py b/smdebug/tensorflow/base_hook.py index d2512623a..42b9b07d0 100644 --- a/smdebug/tensorflow/base_hook.py +++ b/smdebug/tensorflow/base_hook.py @@ -378,8 +378,9 @@ def _close_writers(self) -> None: for mode in to_delete_writers: del self.tb_writers[mode] - self.shape_writer.close() - self.shape_writer = None + if 
self.shape_writer is not None: + self.shape_writer.close() + self.shape_writer = None def _export_model(self): tb_writer = self._maybe_get_tb_writer() From fc25940a8b1e2fdb898c55cf5ba248dc38acf075 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 10 Aug 2020 18:46:22 -0700 Subject: [PATCH 07/40] Import --- tests/tensorflow2/test_keras.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index af355dafe..4c41433af 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -23,6 +23,7 @@ from smdebug.core.access_layer import has_training_ended from smdebug.core.collection import CollectionKeys from smdebug.core.json_config import CONFIG_FILE_PATH_ENV_STR +from smdebug.core.locations import ShapeFileLocation from smdebug.core.modes import ModeKeys from smdebug.core.reduction_config import ALLOWED_NORMS, ALLOWED_REDUCTIONS from smdebug.exceptions import TensorUnavailableForStep From 1357f5d025018829262bc9952f0f0a3179700a3c Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 10 Aug 2020 18:47:25 -0700 Subject: [PATCH 08/40] Import --- tests/tensorflow2/test_keras.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 4c41433af..f69a5a241 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -22,6 +22,7 @@ import smdebug.tensorflow as smd from smdebug.core.access_layer import has_training_ended from smdebug.core.collection import CollectionKeys +from smdebug.core.config_constants import DEFAULT_WORKER_NAME from smdebug.core.json_config import CONFIG_FILE_PATH_ENV_STR from smdebug.core.locations import ShapeFileLocation from smdebug.core.modes import ModeKeys From 44358ee6f7aeebc82e9aa3707884dd3208da0891 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 12 Aug 2020 11:52:28 -0700 Subject: [PATCH 09/40] Add tests for TF --- smdebug/core/writer.py | 11 ++++--- smdebug/tensorflow/base_hook.py | 9 ++++-- tests/pytorch/test_reduce_config.py | 18 ++--------- .../tensorflow/hooks/test_estimator_modes.py | 27 +++++++++++++++- tests/tensorflow/hooks/test_reductions.py | 16 ++++++++++ tests/tensorflow2/test_keras.py | 31 +++++++++---------- tests/utils.py | 20 ++++++++++++ 7 files changed, 91 insertions(+), 41 deletions(-) diff --git a/smdebug/core/writer.py b/smdebug/core/writer.py index abb606632..5d5cb93d5 100644 --- a/smdebug/core/writer.py +++ b/smdebug/core/writer.py @@ -256,7 +256,9 @@ def flush(self): if not self._writer: raise ValueError(f"Cannot flush because self._writer={self._writer}") if not self.shapes: - raise ValueError(f"Cannot write shapes to file {self.file_path} as it is empty") + raise ValueError( + f"Cannot write shapes to file {self.file_path} as it is empty. {self.shapes}" + ) s = json.dumps({"meta": self.meta, "payload": self.shapes}) self._writer.write(s) @@ -268,6 +270,7 @@ def close(self): """Flushes the event file to disk and close the file. 
""" if self._writer is not None: - self.flush() - self._writer.close() - self._writer = None + if self.shapes: + self.flush() + self._writer.close() + self._writer = None diff --git a/smdebug/tensorflow/base_hook.py b/smdebug/tensorflow/base_hook.py index 42b9b07d0..0cf30bf48 100644 --- a/smdebug/tensorflow/base_hook.py +++ b/smdebug/tensorflow/base_hook.py @@ -342,9 +342,12 @@ def _initialize_writers(self, only_initialize_if_missing=False) -> None: raise NotImplementedError if self._saving_shapes_in_step(): - self.shape_writer = ShapeWriter( - trial_dir=self.out_dir, step=self.step, worker=self.worker - ) + if self.shape_writer is None or only_initialize_if_missing is False: + self.shape_writer = ShapeWriter( + trial_dir=self.out_dir, step=self.step, worker=self.worker + ) + else: + assert self.shape_writer is None def _close_writers(self) -> None: if self.dry_run: diff --git a/tests/pytorch/test_reduce_config.py b/tests/pytorch/test_reduce_config.py index c7b525895..7390f335f 100644 --- a/tests/pytorch/test_reduce_config.py +++ b/tests/pytorch/test_reduce_config.py @@ -1,5 +1,4 @@ # Standard Library -import json import os import shutil from datetime import datetime @@ -9,10 +8,9 @@ import torch.nn as nn import torch.nn.functional as F import torch.optim as optim +from tests.utils import verify_shapes # First Party -from smdebug.core.config_constants import DEFAULT_WORKER_NAME -from smdebug.core.locations import ShapeFileLocation from smdebug.pytorch import ReductionConfig, SaveConfig from smdebug.pytorch.hook import Hook as t_hook from smdebug.trials import create_trial @@ -151,19 +149,7 @@ def forward(self, x): hook.register_module(model) optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9) train(model, hook, torch.device("cpu"), optimizer, num_steps=10) - - sl = ShapeFileLocation(0, DEFAULT_WORKER_NAME) - path = os.path.join(out_dir, sl.get_file_location()) - with open(path) as jsfile: - shape_dict = json.load(jsfile) - print(shape_dict["payload"]) - assert "payload" in shape_dict - assert len(shape_dict["payload"]) == 41 - for ts in shape_dict["payload"]: - for dim in ts["shape"]: - assert isinstance(dim, int) - assert isinstance(ts["name"], str) - + verify_shapes(out_dir, 0, 4) if hook_created: shutil.rmtree(out_dir) diff --git a/tests/tensorflow/hooks/test_estimator_modes.py b/tests/tensorflow/hooks/test_estimator_modes.py index e7bd40945..fe481ace4 100644 --- a/tests/tensorflow/hooks/test_estimator_modes.py +++ b/tests/tensorflow/hooks/test_estimator_modes.py @@ -18,6 +18,7 @@ import pytest import tensorflow as tf from tests.analysis.utils import delete_s3_prefix +from tests.utils import verify_shapes # First Party import smdebug.tensorflow as smd @@ -30,10 +31,12 @@ def help_test_mnist( path, save_config=None, + reduction_config=None, hook=None, set_modes=True, num_steps=10, num_eval_steps=None, + save_all=False, steps=None, include_collections=None, ): @@ -125,7 +128,11 @@ def cnn_model_fn(features, labels, mode): if include_collections is None: include_collections = ["weights", "gradients", "default", "losses"] hook = smd.SessionHook( - out_dir=trial_dir, save_config=save_config, include_collections=include_collections + out_dir=trial_dir, + save_config=save_config, + include_collections=include_collections, + save_all=save_all, + reduction_config=reduction_config, ) if num_eval_steps is None: @@ -187,6 +194,24 @@ def test_mnist(out_dir, on_s3=False): helper_test_mnist_trial(out_dir) +@pytest.mark.slow # 0:02 to run +def test_mnist_shapes(out_dir, on_s3=False): + if 
on_s3: + run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f") + bucket = "smdebug-testing" + prefix = "outputs/hooks/estimator_modes/" + run_id + out_dir = f"s3://{bucket}/{prefix}" + help_test_mnist( + out_dir, + save_all=True, + save_config=smd.SaveConfig(save_steps=[0]), + num_steps=1, + steps=None, + reduction_config=smd.ReductionConfig(save_shape=True), + ) + verify_shapes(out_dir, 0, 249) + + @pytest.mark.slow # 0:02 to run def test_mnist_local_json(out_dir, monkeypatch): monkeypatch.setenv( diff --git a/tests/tensorflow/hooks/test_reductions.py b/tests/tensorflow/hooks/test_reductions.py index 4fde66a49..b27589ff6 100644 --- a/tests/tensorflow/hooks/test_reductions.py +++ b/tests/tensorflow/hooks/test_reductions.py @@ -1,5 +1,8 @@ # Standard Library +# Third Party +from tests.tensorflow2.utils import verify_shapes + # First Party import smdebug.tensorflow as smd from smdebug.core.json_config import CONFIG_FILE_PATH_ENV_STR @@ -57,6 +60,19 @@ def test_reductions(out_dir, save_raw_tensor=False): helper_test_reductions(out_dir, hook, save_raw_tensor) +def test_shapes(out_dir, save_raw_tensor=False): + pre_test_clean_up() + rdnc = smd.ReductionConfig(save_shape=True, save_raw_tensor=save_raw_tensor) + hook = smd.SessionHook( + out_dir=out_dir, + save_config=smd.SaveConfig(save_interval=1), + reduction_config=rdnc, + include_collections=["weights", "gradients", "losses"], + ) + simple_model(hook) + verify_shapes(out_dir, 0, 2) + + def test_reductions_with_raw_tensor(out_dir): test_reductions(out_dir, save_raw_tensor=True) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index f69a5a241..6e9d932ec 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -16,15 +16,13 @@ from tests.constants import TEST_DATASET_S3_PATH from tests.tensorflow2.utils import is_tf_2_2, is_tf_2_3 from tests.tensorflow.utils import create_trial_fast_refresh -from tests.utils import use_s3_datasets +from tests.utils import use_s3_datasets, verify_shapes # First Party import smdebug.tensorflow as smd from smdebug.core.access_layer import has_training_ended from smdebug.core.collection import CollectionKeys -from smdebug.core.config_constants import DEFAULT_WORKER_NAME from smdebug.core.json_config import CONFIG_FILE_PATH_ENV_STR -from smdebug.core.locations import ShapeFileLocation from smdebug.core.modes import ModeKeys from smdebug.core.reduction_config import ALLOWED_NORMS, ALLOWED_REDUCTIONS from smdebug.exceptions import TensorUnavailableForStep @@ -193,20 +191,8 @@ def test_keras_gradtape_shapes(out_dir): reduction_config=ReductionConfig(save_shape=True), ) helper_keras_gradtape(trial_dir=out_dir, hook=hook) - sl = ShapeFileLocation(0, DEFAULT_WORKER_NAME) - path = os.path.join(out_dir, sl.get_file_location()) - with open(path) as jsfile: - shape_dict = json.load(jsfile) - print(shape_dict["payload"]) - assert "payload" in shape_dict - assert len(shape_dict["payload"]) == 41 - for ts in shape_dict["payload"]: - for dim in ts["shape"]: - assert isinstance(dim, int) - assert isinstance(ts["name"], str) - - if hook_created: - shutil.rmtree(out_dir) + verify_shapes(out_dir, 0, 9) + verify_shapes(out_dir, 500, 14) @pytest.mark.skip_if_non_eager @@ -479,6 +465,17 @@ def test_keras_fit(out_dir, tf_eager_mode, saveall): assert trial.tensor(tname).value(0) is not None +def test_keras_fit_shapes(out_dir): + hook = smd.KerasHook( + out_dir=out_dir, + save_all=True, + save_config=SaveConfig(save_steps=[0]), + 
reduction_config=ReductionConfig(save_shape=True), + ) + helper_keras_fit(trial_dir=out_dir, hook=hook) + verify_shapes(out_dir, 0, 9) + + @pytest.mark.slow def test_base_reductions(out_dir, tf_eager_mode): helper_keras_fit( diff --git a/tests/utils.py b/tests/utils.py index d5db2a8ba..4472cf7bc 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,4 +1,5 @@ # Standard Library +import json import os import shutil from pathlib import Path @@ -12,8 +13,10 @@ CONFIG_FILE_PATH_ENV_STR, DEFAULT_SAGEMAKER_OUTDIR, DEFAULT_SAGEMAKER_TENSORBOARD_PATH, + DEFAULT_WORKER_NAME, TENSORBOARD_CONFIG_FILE_PATH_ENV_STR, ) +from smdebug.core.locations import ShapeFileLocation from smdebug.core.utils import is_s3, remove_file_if_exists @@ -27,6 +30,23 @@ def use_s3_datasets(): return False +def verify_shapes(out_dir, step_num, num_tensors): + sl = ShapeFileLocation(step_num, DEFAULT_WORKER_NAME) + path = os.path.join(out_dir, sl.get_file_location()) + with open(path) as jsfile: + shape_dict = json.load(jsfile) + print(shape_dict["payload"]) + assert "payload" in shape_dict + assert len(shape_dict["payload"]) == num_tensors, ( + len(shape_dict["payload"]), + shape_dict["payload"], + ) + for ts in shape_dict["payload"]: + for dim in ts["shape"]: + assert isinstance(dim, int) + assert isinstance(ts["name"], str) + + class SagemakerSimulator(object): """ Creates an environment variable pointing to a JSON config file, and creates the config file. From f146c77bcdc0f0020caf3b56f8c197a699ed4098 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 12 Aug 2020 17:17:29 -0700 Subject: [PATCH 10/40] Simplify read code --- smdebug/trials/local_trial.py | 2 +- smdebug/trials/s3_trial.py | 2 +- smdebug/trials/trial.py | 18 +++--------------- 3 files changed, 5 insertions(+), 17 deletions(-) diff --git a/smdebug/trials/local_trial.py b/smdebug/trials/local_trial.py index 0fa4e76ba..e8cfbe51e 100644 --- a/smdebug/trials/local_trial.py +++ b/smdebug/trials/local_trial.py @@ -34,7 +34,7 @@ def __init__( self.index_reader = LocalIndexReader(self.path) self.logger.info(f"Loading trial {name} at path {self.trial_dir}") self._load_collections() - self._load_tensors() + self.refresh_data() def _get_collection_files(self) -> list: return list_collection_files_in_directory(get_path_to_collections(self.path)) diff --git a/smdebug/trials/s3_trial.py b/smdebug/trials/s3_trial.py index 0a3cb6389..c2751fb40 100644 --- a/smdebug/trials/s3_trial.py +++ b/smdebug/trials/s3_trial.py @@ -45,7 +45,7 @@ def __init__( self.path = "s3://" + os.path.join(self.bucket_name, self.prefix_name) self.index_reader = S3IndexReader(self.path) self._load_collections() - self._load_tensors() + self.refresh_data() def _get_collection_files(self) -> list: collection_files, _ = list_s3_objects( diff --git a/smdebug/trials/trial.py b/smdebug/trials/trial.py index 5008ef6a7..2d527305e 100644 --- a/smdebug/trials/trial.py +++ b/smdebug/trials/trial.py @@ -190,7 +190,7 @@ def maybe_refresh(self, name=None): retry_count = 2 while retry_count > 0: if name is None: - self.refresh_tensors() + self.refresh_data() else: self.refresh_tensor(name) if retry_count > 1: @@ -215,7 +215,7 @@ def maybe_refresh(self, name=None): def refresh_tensor(self, tname, steps=None): # for now we load all tensors at once - self.refresh_tensors() + self.refresh_data() def tensor(self, tname): # will not show tensor if it was not written yet @@ -560,10 +560,6 @@ def has_passed_step(self, step, mode=ModeKeys.GLOBAL) -> StepState: return StepState.UNAVAILABLE return 
StepState.NOT_YET_AVAILABLE - def _load_tensors(self): - if self.index_mode: - self._load_tensors_from_index_files() - def _update_last_index_token(self, new_index_token: str) -> None: """ This function updates the last_index_token in the following scenarios: @@ -625,15 +621,7 @@ def _update_last_index_token(self, new_index_token: str) -> None: f"Updating last_complete_step to: {self.last_complete_step}. " ) - def _load_tensors_from_index_files(self): - self.index_tensors_dict, new_index_token = self.index_reader.load_tensor_data_from_index_files( - start_after_key=self.last_index_token, range_steps=self.range_steps - ) - self._load_tensors_from_index_tensors(self.index_tensors_dict) - if new_index_token: # new index token can be None if there are no new index files - self._update_last_index_token(new_index_token) - - def refresh_tensors(self): + def refresh_data(self): # TODO if job finished if self.index_mode: index_tensors_dict, new_index_token = self.index_reader.load_tensor_data_from_index_files( From 5906e5aeaff65323080129806efcf9af2bfced79 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 13 Aug 2020 16:11:41 -0700 Subject: [PATCH 11/40] Add read API and tests --- smdebug/core/hook.py | 18 +++-- smdebug/core/index_reader.py | 48 +++++++++---- smdebug/core/locations.py | 36 ++++------ smdebug/core/reduction_config.py | 24 +++++-- smdebug/core/tensor.py | 35 +++++++++- smdebug/core/tfevent/index_file_writer.py | 36 +++++++--- smdebug/core/writer.py | 69 +++++++++---------- smdebug/exceptions.py | 15 ++++ smdebug/tensorflow/base_hook.py | 38 +++++++--- smdebug/trials/local_trial.py | 6 -- smdebug/trials/s3_trial.py | 6 -- smdebug/trials/trial.py | 84 ++++++++++++++--------- tests/pytorch/test_reduce_config.py | 2 +- tests/tensorflow/keras/test_keras.py | 15 ++++ tests/tensorflow2/test_keras.py | 6 +- tests/utils.py | 22 ++---- 16 files changed, 296 insertions(+), 164 deletions(-) diff --git a/smdebug/core/hook.py b/smdebug/core/hook.py index 1cd8294d8..5256c9adf 100644 --- a/smdebug/core/hook.py +++ b/smdebug/core/hook.py @@ -463,11 +463,15 @@ def _initialize_writers(self, only_initialize_if_missing=False) -> None: if self.save_all_workers is False: if self.worker != self.chief_worker: return + self.writer = FileWriter(trial_dir=self.out_dir, step=self.step, worker=self.worker) if self._saving_shapes_in_step(): self.shape_writer = ShapeWriter( - trial_dir=self.out_dir, step=self.step, worker=self.worker + trial_dir=self.out_dir, + step=self.step, + worker=self.worker, + index_writer=self.writer.index_writer, ) def _get_writers(self, tensor_name, tensor_ref=None) -> List[FileWriter]: @@ -747,10 +751,16 @@ def _write_shape(self, tensor_name, tensor_value, save_collections, tensor_ref=N numpy_tensor_value = self._make_numpy_array(tensor_value) this_size, this_shape = size_and_shape(numpy_tensor_value) if tensor_ref is not None: - name = tensor_ref.tf_obj.name + original_name = tensor_ref.tf_obj.name else: - name = tensor_name - self.shape_writer.write_shape(name, this_shape) + original_name = None + self.shape_writer.write_shape( + tensor_name, + this_shape, + self.mode, + self.mode_steps[self.mode], + original_name=original_name, + ) break def _write_raw_tensor_simple(self, tensor_name, tensor_value, tensor_ref=None, timestamp=None): diff --git a/smdebug/core/index_reader.py b/smdebug/core/index_reader.py index 3b23e1671..58256dd65 100644 --- a/smdebug/core/index_reader.py +++ b/smdebug/core/index_reader.py @@ -16,7 +16,7 @@ MISSING_EVENT_FILE_RETRY_LIMIT, 
MISSING_EVENT_FILE_RETRY_LIMIT_KEY, ) -from smdebug.core.locations import IndexFileLocationUtils, TensorLocation +from smdebug.core.locations import IndexFileLocationUtils, TensorLocation, TensorShape from smdebug.core.logger import get_logger from smdebug.core.modes import ModeKeys from smdebug.core.s3_utils import list_s3_objects @@ -203,8 +203,10 @@ def _validate(index_dict): raise IndexReaderException("meta section is not present") if len(index_dict["meta"]) == 0: raise IndexReaderException("meta section is empty") - if "tensor_payload" not in index_dict: - raise IndexReaderException("tensor_payload section is not present") + if "tensor_payload" not in index_dict and "shape_payload" not in index_dict: + raise IndexReaderException( + "neither tensor_payload nor shape_payload sections are present" + ) def _update_tensors_from_json( self, index_tensors_dict, step, response: bytes, path, worker @@ -233,28 +235,44 @@ def _update_tensors_from_json( mode = index_meta["mode"] mode = ModeKeys[mode.strip()] mode_step = index_meta["mode_step"] - event_file_name = os.path.join(path, index_meta["event_file_name"]) - tensors = index_dict["tensor_payload"] - for tensor in tensors: + + if "event_file_name" in index_meta: + event_file_name = os.path.join(path, index_meta["event_file_name"]) + else: + event_file_name = None + + tensor_payload = index_dict["tensor_payload"] + to_update_index_dict = [] + for tensor in tensor_payload: tensor_name = tensor["tensorname"] start_idx = tensor["start_idx"] length = tensor["length"] tensor_location = TensorLocation( tensor_name, mode, mode_step, event_file_name, start_idx, length, worker ) + to_update_index_dict.append((tensor_name, step, tensor_location)) + + shape_payload = index_dict["shape_payload"] + for tensor in shape_payload: + tensor_name = tensor["tensorname"] + original_name = tensor["originalname"] + shape = tensor["shape"] + ts = TensorShape(tensor_name, mode, mode_step, shape, original_name) + to_update_index_dict.append((tensor_name, step, ts)) + + for tu in to_update_index_dict: + tensor_name, step, obj = tu + if isinstance(obj, TensorLocation): + obj_dict = {"tensor_location": obj} + elif isinstance(obj, TensorShape): + obj_dict = {"tensor_shape": obj} if tensor_name in index_tensors_dict: if step in index_tensors_dict[tensor_name]: - index_tensors_dict[tensor_name][step].update( - {worker: {"tensor_location": tensor_location}} - ) + index_tensors_dict[tensor_name][step].update({worker: obj_dict}) else: - index_tensors_dict[tensor_name].update( - {step: {worker: {"tensor_location": tensor_location}}} - ) + index_tensors_dict[tensor_name].update({step: {worker: obj_dict}}) else: - index_tensors_dict[tensor_name] = { - step: {worker: {"tensor_location": tensor_location}} - } + index_tensors_dict[tensor_name] = {step: {worker: obj_dict}} return index_tensors_dict diff --git a/smdebug/core/locations.py b/smdebug/core/locations.py index 746a7a6cd..9712f58a2 100644 --- a/smdebug/core/locations.py +++ b/smdebug/core/locations.py @@ -24,6 +24,20 @@ def to_dict(self): return {"tensorname": self.tensorname, "start_idx": self.start_idx, "length": self.length} +class TensorShape: + def __init__(self, name, mode, mode_step, shape, original_name=None): + if original_name is None: + original_name = name + self.name = name + self.original_name = original_name + self.mode = mode + self.mode_step = mode_step + self.shape = tuple(shape) + + def to_dict(self): + return {"tensorname": self.name, "originalname": self.original_name, "shape": self.shape} + + 
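The TensorShape record added above is what index files now carry under the new "shape_payload" section. A quick sketch of the round trip through to_dict(), with illustrative names (note the shape is normalized to a tuple):

    from smdebug.core.locations import TensorShape
    from smdebug.core.modes import ModeKeys

    ts = TensorShape(name="gradients/dense_kernel_grad", mode=ModeKeys.TRAIN,
                     mode_step=0, shape=[784, 512], original_name="dense/kernel:0")
    assert ts.shape == (784, 512)
    ts.to_dict()
    # -> {"tensorname": "gradients/dense_kernel_grad",
    #     "originalname": "dense/kernel:0", "shape": (784, 512)}
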
STEP_NUMBER_FORMATTING_LENGTH = "012" @@ -89,28 +103,6 @@ def get_step_dir_path(cls, trial_dir, step_num): return os.path.join(cls.get_dir(trial_dir), format(step_num, STEP_NUMBER_FORMATTING_LENGTH)) -class ShapeFileLocation(TensorFileLocation): - def __init__(self, step_num, worker_name): - super().__init__(step_num, worker_name) - - def get_filename(self): - step_num_str = self.get_step_num_str() - return f"{step_num_str}_{self.worker_name}_shapes.json" - - @classmethod - def load_filename(cls, s, print_error=True): - name = os.path.basename(s) - m = re.search("(.*)_(.*)_shapes.json$", name) - if m: - step_num = int(m.group(1)) - worker_name = m.group(2) - return cls(step_num=step_num, worker_name=worker_name) - else: - if print_error: - logger.error("Failed to load shape file location: ", s) - return None - - class TensorboardFileLocation(EventFileLocation): def __init__(self, step_num, worker_name, mode=None): super().__init__(step_num, worker_name) diff --git a/smdebug/core/reduction_config.py b/smdebug/core/reduction_config.py index 6ad78cdee..1fa6121b6 100644 --- a/smdebug/core/reduction_config.py +++ b/smdebug/core/reduction_config.py @@ -1,16 +1,25 @@ # Standard Library import json from typing import Any, Dict -from smdebug.core.logger import get_logger -logger = get_logger() # First Party +from smdebug.core.logger import get_logger from smdebug.core.utils import split +logger = get_logger() + + ALLOWED_REDUCTIONS = ["min", "max", "mean", "std", "variance", "sum", "prod"] ALLOWED_NORMS = ["l1", "l2"] REDUCTION_CONFIG_VERSION_NUM = "v0" -ALLOWED_PARAMS = ["reductions", "abs_reductions", "norms", "abs_norms", "save_raw_tensor", "save_shape"] +ALLOWED_PARAMS = [ + "reductions", + "abs_reductions", + "norms", + "abs_norms", + "save_raw_tensor", + "save_shape", +] class ReductionConfig: @@ -82,7 +91,6 @@ def _check(self): if not isinstance(self.save_shape, bool): raise ValueError(f"save_shape={self.save_shape} must be a boolean") - @classmethod def from_dict(cls, params: Dict[str, Any]) -> "ReductionConfig": """Parses a flattened dict with two keys: `save_raw_tensor` and `reductions`.""" @@ -115,7 +123,7 @@ def from_dict(cls, params: Dict[str, Any]) -> "ReductionConfig": norms=norms, abs_norms=abs_norms, save_raw_tensor=save_raw_tensor, - save_shape=save_shape + save_shape=save_shape, ) @classmethod @@ -136,7 +144,11 @@ def to_json_dict(self) -> Dict[str, Any]: all_reductions.append(f"abs_{red}_norm") all_reductions_str = ",".join(all_reductions) # Return the dict - return {"save_raw_tensor": self.save_raw_tensor, "reductions": all_reductions_str, "save_shape": self.save_shape} + return { + "save_raw_tensor": self.save_raw_tensor, + "reductions": all_reductions_str, + "save_shape": self.save_shape, + } def to_json(self) -> str: return json.dumps(self.to_json_dict()) diff --git a/smdebug/core/tensor.py b/smdebug/core/tensor.py index c268c48fc..de52f7858 100644 --- a/smdebug/core/tensor.py +++ b/smdebug/core/tensor.py @@ -10,6 +10,7 @@ from smdebug.exceptions import ( InvalidWorker, NoMoreData, + ShapeUnavailableForStep, StepNotYetAvailable, StepUnavailable, TensorUnavailableForStep, @@ -62,6 +63,16 @@ def set_step_location(self, step_num, worker, location): s = self._steps[step_num][worker] s.location = location + def set_step_shape(self, step_num, worker, shape): + step = Step(step_num, shape=shape) + if step_num not in self._steps: + self._steps[step_num] = {worker: step} + elif worker not in self._steps[step_num]: + self._steps[step_num].update({worker: step}) + + s = 
self._steps[step_num][worker] + s.shape = shape + def set_step_reduction_value(self, step_num, worker, red_name, abs, red_value): if step_num not in self._steps: s = Step(step_num) @@ -88,10 +99,11 @@ def step(self, step_num): class Step: """Contains the step number, value, location, and reduction values/locations.""" - def __init__(self, step_num, value=None, location=None): + def __init__(self, step_num, value=None, location=None, shape=None): self.step_num = step_num self.value = value self.location = location + self.shape = shape # mapping from (red_name, abs) to value self._reduction_values = {} @@ -126,6 +138,9 @@ class Tensor: def __init__(self, name, trial, cache): self._mode_steps = {} self.name = name + # SMdebug modifies some names of tensors to be more descriptive + # In such cases we save here the original name + self.original_name = None self.trial = trial self.cache = cache @@ -264,6 +279,16 @@ def value(self, step_num, mode=ModeKeys.GLOBAL, worker=None): has_reductions = has_reduction_locations or has_reduction_values raise TensorUnavailableForStep(self.name, step_num, mode, has_reductions) + def shape(self, step_num, mode=ModeKeys.GLOBAL, worker=None): + s = self._step(step_num=step_num, mode=mode, worker=worker) + if s.shape is not None: + return s.shape + try: + value = self.value(step_num, mode, worker) + return value.shape + except TensorUnavailableForStep: + raise ShapeUnavailableForStep(self.name, step_num, mode) + def reduction_values(self, step_num, mode=ModeKeys.GLOBAL, worker=None): s = self._step(step_num=step_num, mode=mode, worker=worker) if s is not None: @@ -334,9 +359,13 @@ def _create_mode_step(self, mode, mode_step): if mode not in self._mode_steps: self._mode_steps[mode] = ModeSteps(mode) - def add_step(self, mode, mode_step, worker, location): + def add_step(self, mode, mode_step, worker, tensor_location, tensor_shape): self._create_mode_step(mode, mode_step) - self._mode_steps[mode].set_step_location(mode_step, worker, location) + if tensor_location is not None: + self._mode_steps[mode].set_step_location(mode_step, worker, tensor_location) + if tensor_shape is not None: + self._mode_steps[mode].set_step_shape(mode_step, worker, tensor_shape.shape) + self.original_name = tensor_shape.original_name def add_reduction_step(self, mode, mode_step, worker, red_name, abs, red_location): self._create_mode_step(mode, mode_step) diff --git a/smdebug/core/tfevent/index_file_writer.py b/smdebug/core/tfevent/index_file_writer.py index 80cb54b68..886c7760e 100644 --- a/smdebug/core/tfevent/index_file_writer.py +++ b/smdebug/core/tfevent/index_file_writer.py @@ -13,6 +13,7 @@ def __init__(self, file_path): self.file_path = file_path self.index_payload = [] self.index_meta = {} + self.shape_payload = [] self.writer = None def __exit__(self): @@ -28,7 +29,7 @@ def _init_writer(self): def add_index(self, tensorlocation): if not self.writer: self._init_writer() - if not self.index_meta: + if not self.index_meta or not "event_file_name" in self.index_meta: self.index_meta = { "mode": tensorlocation.mode, "mode_step": tensorlocation.mode_step, @@ -36,6 +37,13 @@ def add_index(self, tensorlocation): } self.index_payload.append(tensorlocation.to_dict()) + def add_shape(self, tensorshape): + if not self.writer: + self._init_writer() + if not self.index_meta: + self.index_meta = {"mode": tensorshape.mode, "mode_step": tensorshape.mode_step} + self.shape_payload.append(tensorshape.to_dict()) + def flush(self): """Flushes the event string to file.""" if not self.writer: @@ -44,33 
+52,45 @@ def flush(self): raise ValueError( f"Cannot write empty index_meta={self.index_meta} to file {self.file_path}" ) - if not self.index_payload: + if not self.index_payload and not self.shape_payload: raise ValueError( - f"Cannot write empty index_payload={self.index_payload} to file {self.file_path}" + f"Cannot write empty payload: index_payload={self.index_payload}, shape_payload={self.shape_payload} to file {self.file_path}" ) - index = Index(meta=self.index_meta, tensor_payload=self.index_payload) + index = Index( + meta=self.index_meta, + tensor_payload=self.index_payload, + shape_payload=self.shape_payload, + ) self.writer.write(index.to_json()) self.writer.flush() self.index_meta = {} - self.index_payload = {} + self.index_payload = [] + self.shape_payload = [] def close(self): """Closes the record writer.""" if self.writer is not None: - if self.index_meta and self.index_payload: + if self.index_meta and (self.index_payload or self.shape_payload): self.flush() self.writer.close() self.writer = None class Index: - def __init__(self, meta=None, tensor_payload=None): + def __init__(self, meta=None, tensor_payload=None, shape_payload=None): self.meta = meta self.tensor_payload = tensor_payload + self.shape_payload = shape_payload def to_json(self): - return json.dumps({"meta": self.meta, "tensor_payload": self.tensor_payload}) + return json.dumps( + { + "meta": self.meta, + "tensor_payload": self.tensor_payload, + "shape_payload": self.shape_payload, + } + ) class EventWithIndex(object): diff --git a/smdebug/core/writer.py b/smdebug/core/writer.py index 5d5cb93d5..4932e72e0 100644 --- a/smdebug/core/writer.py +++ b/smdebug/core/writer.py @@ -39,9 +39,9 @@ # Local from .locations import ( IndexFileLocationUtils, - ShapeFileLocation, TensorboardFileLocation, TensorFileLocation, + TensorShape, ) from .logger import get_logger from .modes import ModeKeys @@ -58,6 +58,7 @@ def __init__(self, trial_dir, worker, step=0, mode=ModeKeys.GLOBAL): assert False, "Worker should not be none. Check worker name initialization" self.mode = mode self._writer = None + self._index_writer = None def name(self): return self._writer.name() @@ -77,6 +78,21 @@ def flush(self): self._writer.flush() # don't flush index writer as we only want to flush on close + @classmethod + def create_index_writer(cls, trial_dir, worker, step): + el = TensorFileLocation(step_num=step, worker_name=worker) + event_file_path = el.get_file_location(trial_dir=trial_dir) + index_file_path = IndexFileLocationUtils.get_index_key_for_step(trial_dir, step, worker) + return IndexWriter(index_file_path) + + @property + def index_writer(self): + return self._index_writer + + @index_writer.setter + def index_writer(self, iw): + self._index_writer = iw + class FileWriter(BaseWriter): def __init__( @@ -90,6 +106,7 @@ def __init__( flush_secs=120, verbose=False, write_checksum=False, + index_writer=None, ): """Creates a `FileWriter` and an file. On construction the summary writer creates a new event file in `trial_dir`. 
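With this patch the standalone shapes file goes away: ShapeWriter becomes a thin wrapper that forwards TensorShape records to the same IndexWriter the FileWriter owns, so shape entries land in the index file next to tensor locations. A sketch of the wiring the hook performs (illustrative paths, mirroring the _initialize_writers change earlier in this patch):

    from smdebug.core.modes import ModeKeys
    from smdebug.core.writer import FileWriter, ShapeWriter

    fw = FileWriter(trial_dir="/tmp/trial", step=0, worker="worker_0")
    sw = ShapeWriter(trial_dir="/tmp/trial", step=0, worker="worker_0",
                     index_writer=fw.index_writer)
    # Recorded against the current mode/mode_step, like hook._write_shape does:
    sw.write_shape("conv1.weight", (20, 1, 5, 5),
                   mode=ModeKeys.TRAIN, mode_step=0)
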
@@ -113,12 +130,14 @@ def __init__( """ super(FileWriter, self).__init__(trial_dir, worker, step, mode) if wtype == "events": + if index_writer is None: + self.index_writer = self.create_index_writer( + trial_dir=trial_dir, worker=worker, step=step + ) + else: + self.index_writer = index_writer el = TensorFileLocation(step_num=self.step, worker_name=self.worker) event_file_path = el.get_file_location(trial_dir=self.trial_dir) - index_file_path = IndexFileLocationUtils.get_index_key_for_step( - self.trial_dir, self.step, self.worker - ) - self.index_writer = IndexWriter(index_file_path) elif wtype == "tensorboard": el = TensorboardFileLocation( step_num=self.step, worker_name=self.worker, mode=self.mode @@ -236,41 +255,21 @@ def _check_mode_step(mode, mode_step, global_step): class ShapeWriter(BaseWriter): - def __init__(self, trial_dir, worker, step=0, mode=ModeKeys.GLOBAL): + def __init__(self, trial_dir, worker, index_writer, step=0, mode=ModeKeys.GLOBAL): super(ShapeWriter, self).__init__(trial_dir, worker, step, mode) - el = ShapeFileLocation(step_num=self.step, worker_name=self.worker) - self.file_path = el.get_file_location(trial_dir=self.trial_dir) - s3, bucket_name, key_name = is_s3(self.file_path) - if s3: - self._writer = TSAccessS3(bucket_name, key_name, binary=False) - else: - self._writer = TSAccessFile(self.file_path, "a+") + self._index_writer = index_writer - self.shapes = [] - self.meta = {} - - def write_shape(self, name, shape: Tuple[int]): - self.shapes.append({"name": name, "shape": shape}) + def write_shape( + self, name, shape: Tuple[int], mode=ModeKeys.GLOBAL, mode_step=None, original_name=None + ): + self._index_writer.add_shape( + TensorShape(name, mode, mode_step, shape, original_name=original_name) + ) def flush(self): - if not self._writer: - raise ValueError(f"Cannot flush because self._writer={self._writer}") - if not self.shapes: - raise ValueError( - f"Cannot write shapes to file {self.file_path} as it is empty. {self.shapes}" - ) - - s = json.dumps({"meta": self.meta, "payload": self.shapes}) - self._writer.write(s) - self._writer.flush() - self.meta = {} - self.shapes = [] + self._index_writer.flush() def close(self): """Flushes the event file to disk and close the file. """ - if self._writer is not None: - if self.shapes: - self.flush() - self._writer.close() - self._writer = None + self._index_writer.close() diff --git a/smdebug/exceptions.py b/smdebug/exceptions.py index 6d917e6bf..e2ed43da9 100644 --- a/smdebug/exceptions.py +++ b/smdebug/exceptions.py @@ -68,6 +68,21 @@ def __str__(self): return msg +class ShapeUnavailableForStep(Exception): + def __init__(self, tname, step, mode=modes.GLOBAL): + self.step = step + self.mode = mode + self.tname = tname + + def __str__(self): + msg = ( + "Shape for tensor {} is not available for step {} " + "with mode {} as it was not saved." + "".format(self.tname, self.step, self.mode.name) + ) + return msg + + class TensorUnavailable(Exception): def __init__(self, tname): self.tname = tname diff --git a/smdebug/tensorflow/base_hook.py b/smdebug/tensorflow/base_hook.py index 0cf30bf48..02c69b36f 100644 --- a/smdebug/tensorflow/base_hook.py +++ b/smdebug/tensorflow/base_hook.py @@ -86,6 +86,7 @@ def __init__( Example -> /job:worker/replica:0/task:1/device:GPU:0 : _job-worker_replica-0_task-1_device-GPU-0""" self.device_map = {} self.writer_map = {} + self.shape_writer_map = {} # This will be None if the var wasn't set, i.e. 
not param server self.tf_config_json = load_tf_config_json(os.getenv("TF_CONFIG")) self._hook_supported = None @@ -320,6 +321,13 @@ def _initialize_writers(self, only_initialize_if_missing=False) -> None: self.writer = FileWriter( trial_dir=self.out_dir, step=self.step, worker=self.worker ) + if self._saving_shapes_in_step(): + self.shape_writer = ShapeWriter( + trial_dir=self.out_dir, + step=self.step, + worker=self.worker, + index_writer=self.writer.index_writer, + ) elif self.distribution_strategy == TFDistributionStrategy.MIRRORED: if len(self.device_map): for device, device_string in self.device_map.items(): @@ -329,26 +337,40 @@ def _initialize_writers(self, only_initialize_if_missing=False) -> None: self.writer_map[device_string] = FileWriter( trial_dir=self.out_dir, step=self.step, worker=device_string ) + if self._saving_shapes_in_step(): + self.shape_writer[device_string] = ShapeWriter( + trial_dir=self.out_dir, + step=self.step, + worker=self.worker, + index_writer=self.writer_map[device_string].index_writer, + ) else: # training on CPU when all device strings have cpu if self.writer is None or only_initialize_if_missing is False: self.writer = FileWriter( trial_dir=self.out_dir, step=self.step, worker=self.worker ) + if self._saving_shapes_in_step(): + self.shape_writer = ShapeWriter( + trial_dir=self.out_dir, + step=self.step, + worker=self.worker, + index_writer=self.writer.index_writer, + ) + elif self.distribution_strategy == TFDistributionStrategy.NONE: if self.writer is None or only_initialize_if_missing is False: self.writer = FileWriter(trial_dir=self.out_dir, step=self.step, worker=self.worker) + if self._saving_shapes_in_step(): + self.shape_writer = ShapeWriter( + trial_dir=self.out_dir, + step=self.step, + worker=self.worker, + index_writer=self.writer.index_writer, + ) else: raise NotImplementedError - if self._saving_shapes_in_step(): - if self.shape_writer is None or only_initialize_if_missing is False: - self.shape_writer = ShapeWriter( - trial_dir=self.out_dir, step=self.step, worker=self.worker - ) - else: - assert self.shape_writer is None - def _close_writers(self) -> None: if self.dry_run: return diff --git a/smdebug/trials/local_trial.py b/smdebug/trials/local_trial.py index e8cfbe51e..272acff83 100644 --- a/smdebug/trials/local_trial.py +++ b/smdebug/trials/local_trial.py @@ -39,12 +39,6 @@ def __init__( def _get_collection_files(self) -> list: return list_collection_files_in_directory(get_path_to_collections(self.path)) - def _load_tensors_from_index_tensors(self, index_tensors_dict): - for tname in index_tensors_dict: - for step, itds in index_tensors_dict[tname].items(): - for worker in itds: - self._add_tensor(int(step), worker, itds[worker]["tensor_location"]) - def _read_collections(self, collection_files): first_collection_file = collection_files[0] # First Collection File self.collection_manager = CollectionManager.load(first_collection_file) diff --git a/smdebug/trials/s3_trial.py b/smdebug/trials/s3_trial.py index c2751fb40..374ecdfad 100644 --- a/smdebug/trials/s3_trial.py +++ b/smdebug/trials/s3_trial.py @@ -56,12 +56,6 @@ def _get_collection_files(self) -> list: ) return collection_files - def _load_tensors_from_index_tensors(self, index_tensors_dict): - for tname in index_tensors_dict: - for step, itds in index_tensors_dict[tname].items(): - for worker in itds: - self._add_tensor(int(step), worker, itds[worker]["tensor_location"]) - def _read_collections(self, collection_files): first_collection_file = collection_files[0] # First Collection 
File key = os.path.join(first_collection_file) diff --git a/smdebug/trials/trial.py b/smdebug/trials/trial.py index 2d527305e..c1c8d9c84 100644 --- a/smdebug/trials/trial.py +++ b/smdebug/trials/trial.py @@ -14,7 +14,7 @@ TRAINING_END_DELAY_REFRESH_DEFAULT, TRAINING_END_DELAY_REFRESH_KEY, ) -from smdebug.core.locations import IndexFileLocationUtils, TensorLocation +from smdebug.core.locations import IndexFileLocationUtils, TensorLocation, TensorShape from smdebug.core.logger import get_logger from smdebug.core.modes import ModeKeys from smdebug.core.reductions import REDUCTIONS_PREFIX, reverse_reduction_tensor_name @@ -231,14 +231,14 @@ def has_tensor(self, tname): self.maybe_refresh(tname) return tname in self._tensors - def _populate_step_dict(self, tensor_object, step_num): - if tensor_object.mode != ModeKeys.GLOBAL: - if tensor_object.mode not in self._mode_to_global: - self._mode_to_global[tensor_object.mode] = {} - if tensor_object.mode_step not in self._mode_to_global[tensor_object.mode]: - self._mode_to_global[tensor_object.mode][tensor_object.mode_step] = int(step_num) + def _populate_step_dict(self, mode, mode_step, step_num): + if mode != ModeKeys.GLOBAL: + if mode not in self._mode_to_global: + self._mode_to_global[mode] = {} + if mode_step not in self._mode_to_global[mode]: + self._mode_to_global[mode][mode_step] = int(step_num) if step_num not in self._global_to_mode: - self._global_to_mode[step_num] = (tensor_object.mode, tensor_object.mode_step) + self._global_to_mode[step_num] = (mode, mode_step) def _populate_workers_for_global_step(self, step, worker) -> None: """ @@ -263,7 +263,7 @@ def _populate_workers_for_global_step(self, step, worker) -> None: self.last_complete_step = step self.logger.debug(f"Populating last completing step to: {step}") - def _populate_global_step_to_tensor_name_map(self, tensor: TensorLocation, step_num) -> None: + def _populate_global_step_to_tensor_name_map(self, tensorname: str, step_num) -> None: """ The self.global_step_to_tensors_map dictionary holds a mapping of step number and a set of all the tensor names that have been written for the step. @@ -274,47 +274,67 @@ def _populate_global_step_to_tensor_name_map(self, tensor: TensorLocation, step_ """ if step_num not in self.global_step_to_tensors_map: self.global_step_to_tensors_map[step_num] = set() - self.global_step_to_tensors_map[step_num].add(tensor.tensorname) + self.global_step_to_tensors_map[step_num].add(tensorname) - def _populate_mode_to_tensor_name_map(self, tensor: TensorLocation) -> None: + def _populate_mode_to_tensor_name_map(self, tensorname, mode) -> None: """ The self.mode_to_tensors_map dictionary holds a mapping of mode and a set of all the tensor names that have been written for the mode. 
:param tensor: :return: """ - if tensor.mode != ModeKeys.GLOBAL: - if tensor.mode not in self.mode_to_tensors_map: - self.mode_to_tensors_map[tensor.mode] = set() - self.mode_to_tensors_map[tensor.mode].add(tensor.tensorname) + if mode != ModeKeys.GLOBAL: + if mode not in self.mode_to_tensors_map: + self.mode_to_tensors_map[mode] = set() + self.mode_to_tensors_map[mode].add(tensorname) - def _add_tensor(self, step_num, worker, tensor_object: TensorLocation): + def _load_tensors_from_index_tensors(self, index_tensors_dict): + for tname in index_tensors_dict: + for step, itds in index_tensors_dict[tname].items(): + for worker in itds: + self._add_tensor( + int(step), + worker, + itds[worker].get("tensor_location", None), + itds[worker].get("tensor_shape", None), + ) + + def _add_tensor( + self, step_num, worker, tensor_location: TensorLocation, tensor_shape: TensorShape + ): is_reduction = False - if REDUCTIONS_PREFIX in tensor_object.tensorname: - tname, red_name, abs = reverse_reduction_tensor_name(tensor_object.tensorname) - tensor_object.tensorname = tname - is_reduction = True + if tensor_location is not None: + tensorname = tensor_location.tensorname + mode = tensor_location.mode + mode_step = tensor_location.mode_step + elif tensor_shape is not None: + tensorname = tensor_shape.name + mode = tensor_shape.mode + mode_step = tensor_shape.mode_step else: - tname = tensor_object.tensorname + raise RuntimeError("both tensor_location and tensor_shape can't be None") - if tname not in self._tensors: - tensor = Tensor(tname, trial=self, cache=self.cache) - self._tensors[tname] = tensor + if REDUCTIONS_PREFIX in tensorname: + tensorname, red_name, abs = reverse_reduction_tensor_name(tensorname) + is_reduction = True - tensor = self._tensors[tname] + if tensorname not in self._tensors: + tensor = Tensor(tensorname, trial=self, cache=self.cache) + self._tensors[tensorname] = tensor + + tensor = self._tensors[tensorname] if is_reduction: - tensor.add_reduction_step( - tensor_object.mode, tensor_object.mode_step, worker, red_name, abs, tensor_object - ) + tensor.add_reduction_step(mode, mode_step, worker, red_name, abs, tensor_location) else: - tensor.add_step(tensor_object.mode, tensor_object.mode_step, worker, tensor_object) + # shape can only be passed for actual tensor, not reductions + tensor.add_step(mode, mode_step, worker, tensor_location, tensor_shape) - self._populate_step_dict(tensor_object, step_num) - self._populate_global_step_to_tensor_name_map(tensor_object, step_num) + self._populate_step_dict(mode, mode_step, step_num) + self._populate_global_step_to_tensor_name_map(tensorname, step_num) self._populate_workers_for_global_step(step_num, worker) - self._populate_mode_to_tensor_name_map(tensor_object) + self._populate_mode_to_tensor_name_map(tensorname, mode) def _tensors_matching_regex(self, regex_list) -> set: matched_tensornames = set() diff --git a/tests/pytorch/test_reduce_config.py b/tests/pytorch/test_reduce_config.py index 7390f335f..5f6f6796a 100644 --- a/tests/pytorch/test_reduce_config.py +++ b/tests/pytorch/test_reduce_config.py @@ -149,7 +149,7 @@ def forward(self, x): hook.register_module(model) optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9) train(model, hook, torch.device("cpu"), optimizer, num_steps=10) - verify_shapes(out_dir, 0, 4) + verify_shapes(out_dir, 0, 41) if hook_created: shutil.rmtree(out_dir) diff --git a/tests/tensorflow/keras/test_keras.py b/tests/tensorflow/keras/test_keras.py index 2fcb379f6..654d05418 100644 --- 
a/tests/tensorflow/keras/test_keras.py +++ b/tests/tensorflow/keras/test_keras.py @@ -5,6 +5,7 @@ import pytest import tensorflow as tf from tests.tensorflow.utils import create_trial_fast_refresh +from tests.utils import verify_shapes # First Party from smdebug.core.access_layer import has_training_ended @@ -224,6 +225,20 @@ def test_tf_keras(out_dir): exhaustive_check(out_dir, True) +@pytest.mark.slow # 0:07 to run +def test_tf_keras_shapes(out_dir): + train_model( + out_dir, + save_all=True, + reduction_config=ReductionConfig(save_shape=True), + use_tf_keras=True, + save_config=SaveConfig(save_steps=[0, 10]), + eager=False, + steps=["train", "eval", "predict", "train"], + ) + verify_shapes(out_dir, 0, 21) + + @pytest.mark.slow # 0:03 to run def test_tf_keras_non_keras_opt(out_dir): include_collections = [ diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 6e9d932ec..349281d5c 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -191,8 +191,8 @@ def test_keras_gradtape_shapes(out_dir): reduction_config=ReductionConfig(save_shape=True), ) helper_keras_gradtape(trial_dir=out_dir, hook=hook) - verify_shapes(out_dir, 0, 9) - verify_shapes(out_dir, 500, 14) + verify_shapes(out_dir, 0, 10) + verify_shapes(out_dir, 500, 15) @pytest.mark.skip_if_non_eager @@ -473,7 +473,7 @@ def test_keras_fit_shapes(out_dir): reduction_config=ReductionConfig(save_shape=True), ) helper_keras_fit(trial_dir=out_dir, hook=hook) - verify_shapes(out_dir, 0, 9) + verify_shapes(out_dir, 0, 12) @pytest.mark.slow diff --git a/tests/utils.py b/tests/utils.py index 4472cf7bc..298e77d38 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -16,8 +16,8 @@ DEFAULT_WORKER_NAME, TENSORBOARD_CONFIG_FILE_PATH_ENV_STR, ) -from smdebug.core.locations import ShapeFileLocation from smdebug.core.utils import is_s3, remove_file_if_exists +from smdebug.trials import create_trial def use_s3_datasets(): @@ -31,20 +31,12 @@ def use_s3_datasets(): def verify_shapes(out_dir, step_num, num_tensors): - sl = ShapeFileLocation(step_num, DEFAULT_WORKER_NAME) - path = os.path.join(out_dir, sl.get_file_location()) - with open(path) as jsfile: - shape_dict = json.load(jsfile) - print(shape_dict["payload"]) - assert "payload" in shape_dict - assert len(shape_dict["payload"]) == num_tensors, ( - len(shape_dict["payload"]), - shape_dict["payload"], - ) - for ts in shape_dict["payload"]: - for dim in ts["shape"]: - assert isinstance(dim, int) - assert isinstance(ts["name"], str) + trial = create_trial(out_dir) + tnames = trial.tensor_names(step=step_num) + assert num_tensors == len(tnames), (len(tnames), tnames) + for tname in tnames: + tensor = trial.tensor(tname) + assert isinstance(tensor.shape(step_num), tuple), (tname, tensor.shape(step_num)) class SagemakerSimulator(object): From 681e35c21b5cb4f2108e3d59dcb1d7024f55fb95 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 13 Aug 2020 18:33:14 -0700 Subject: [PATCH 12/40] Add mxnet test --- tests/mxnet/test_hook_reduce_config.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/mxnet/test_hook_reduce_config.py b/tests/mxnet/test_hook_reduce_config.py index 245a16a80..550f82a1a 100644 --- a/tests/mxnet/test_hook_reduce_config.py +++ b/tests/mxnet/test_hook_reduce_config.py @@ -2,6 +2,9 @@ import shutil from datetime import datetime +# Third Party +from tests.utils import verify_shapes + # First Party from smdebug.mxnet import ReductionConfig, SaveConfig from smdebug.mxnet.hook import Hook as t_hook @@ 
-24,6 +27,7 @@ def test_save_config(hook=None, out_dir=None): hook = t_hook( out_dir=out_dir, save_config=global_save_config, + save_all=True, include_collections=[ "weights", "biases", @@ -82,6 +86,26 @@ def test_save_config(hook=None, out_dir=None): shutil.rmtree(out_dir) +def test_save_shapes(out_dir, hook=None): + hook_created = False + if hook is None: + hook_created = True + global_reduce_config = ReductionConfig(save_shape=True) + global_save_config = SaveConfig(save_steps=[0, 1]) + + hook = t_hook( + out_dir=out_dir, + save_config=global_save_config, + save_all=True, + reduction_config=global_reduce_config, + ) + run_mnist_gluon_model(hook=hook, num_steps_train=5) + verify_shapes(out_dir, 0, 49) + verify_shapes(out_dir, 1, 49) + if hook_created: + shutil.rmtree(out_dir) + + def test_save_config_hook_from_json(): from smdebug.core.json_config import CONFIG_FILE_PATH_ENV_STR import os From 5dc47ffa2da577f2066fa90808547ea78ad47c52 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 13 Aug 2020 18:59:57 -0700 Subject: [PATCH 13/40] Add s3 and json tests --- docs/api.md | 1 + .../test_json_configs/test_hook_save_shape.json | 9 +++++++++ tests/pytorch/test_reduce_config.py | 13 +++++++++++++ tests/tensorflow/hooks/test_estimator_modes.py | 5 +++++ 4 files changed, 28 insertions(+) create mode 100644 tests/pytorch/test_json_configs/test_hook_save_shape.json diff --git a/docs/api.md b/docs/api.md index 778cf3e46..3490b14a5 100644 --- a/docs/api.md +++ b/docs/api.md @@ -96,6 +96,7 @@ include_workers include_regex reductions save_raw_tensor +save_shape save_interval save_steps start_step diff --git a/tests/pytorch/test_json_configs/test_hook_save_shape.json b/tests/pytorch/test_json_configs/test_hook_save_shape.json new file mode 100644 index 000000000..166051af4 --- /dev/null +++ b/tests/pytorch/test_json_configs/test_hook_save_shape.json @@ -0,0 +1,9 @@ +{ + "S3Path": "s3://kjndjknd_bucket/prefix", + "LocalPath": "/tmp/test_output/test_hook_save_shape/jsonloading", + "HookParameters": { + "save_all": true, + "save_shape": true, + "save_steps": "0,1" + } + } diff --git a/tests/pytorch/test_reduce_config.py b/tests/pytorch/test_reduce_config.py index 5f6f6796a..0f6c26d74 100644 --- a/tests/pytorch/test_reduce_config.py +++ b/tests/pytorch/test_reduce_config.py @@ -154,6 +154,19 @@ def forward(self, x): shutil.rmtree(out_dir) +def test_save_shapes_json(): + from smdebug.core.json_config import CONFIG_FILE_PATH_ENV_STR + + out_dir = "/tmp/test_output/test_hook_save_shape/jsonloading" + shutil.rmtree(out_dir, True) + os.environ[ + CONFIG_FILE_PATH_ENV_STR + ] = "tests/pytorch/test_json_configs/test_hook_save_shape.json" + hook = t_hook.create_from_json_file() + test_save_shapes(hook=hook, out_dir=out_dir) + shutil.rmtree(out_dir, True) + + # Test creating hook by loading the json file with reduction configs. 
def test_reduce_config_with_json(): from smdebug.core.json_config import CONFIG_FILE_PATH_ENV_STR diff --git a/tests/tensorflow/hooks/test_estimator_modes.py b/tests/tensorflow/hooks/test_estimator_modes.py index fe481ace4..c947c8ddf 100644 --- a/tests/tensorflow/hooks/test_estimator_modes.py +++ b/tests/tensorflow/hooks/test_estimator_modes.py @@ -212,6 +212,11 @@ def test_mnist_shapes(out_dir, on_s3=False): verify_shapes(out_dir, 0, 249) +@pytest.mark.slow # 0:02 to run +def test_mnist_shapes_s3(out_dir): + test_mnist_shapes(out_dir, on_s3=True) + + @pytest.mark.slow # 0:02 to run def test_mnist_local_json(out_dir, monkeypatch): monkeypatch.setenv( From c775942c805b0fc1c3bccbe4945a08535142f8f5 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Fri, 14 Aug 2020 15:14:50 -0700 Subject: [PATCH 14/40] lint --- smdebug/core/writer.py | 3 --- tests/utils.py | 1 - 2 files changed, 4 deletions(-) diff --git a/smdebug/core/writer.py b/smdebug/core/writer.py index 4932e72e0..3ed1f6da5 100644 --- a/smdebug/core/writer.py +++ b/smdebug/core/writer.py @@ -17,11 +17,9 @@ """APIs for logging data in the event file.""" # Standard Library -import json from typing import Tuple # First Party -from smdebug.core.access_layer import TSAccessFile, TSAccessS3 from smdebug.core.modes import MODE_PLUGIN_NAME, MODE_STEP_PLUGIN_NAME from smdebug.core.tfevent.event_file_writer import EventFileWriter from smdebug.core.tfevent.index_file_writer import IndexWriter @@ -34,7 +32,6 @@ scalar_summary, ) from smdebug.core.tfevent.util import make_tensor_proto -from smdebug.core.utils import is_s3 # Local from .locations import ( diff --git a/tests/utils.py b/tests/utils.py index 298e77d38..d087c93a7 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,5 +1,4 @@ # Standard Library -import json import os import shutil from pathlib import Path From 355be0bf58587f5bce296bb6d2387806aa912f63 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 17 Aug 2020 12:00:43 -0700 Subject: [PATCH 15/40] Fix payload --- smdebug/core/index_reader.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/smdebug/core/index_reader.py b/smdebug/core/index_reader.py index 58256dd65..5af1ef157 100644 --- a/smdebug/core/index_reader.py +++ b/smdebug/core/index_reader.py @@ -241,8 +241,9 @@ def _update_tensors_from_json( else: event_file_name = None - tensor_payload = index_dict["tensor_payload"] to_update_index_dict = [] + + tensor_payload = index_dict.get("tensor_payload", []) for tensor in tensor_payload: tensor_name = tensor["tensorname"] start_idx = tensor["start_idx"] @@ -252,7 +253,7 @@ def _update_tensors_from_json( ) to_update_index_dict.append((tensor_name, step, tensor_location)) - shape_payload = index_dict["shape_payload"] + shape_payload = index_dict.get("shape_payload", []) for tensor in shape_payload: tensor_name = tensor["tensorname"] original_name = tensor["originalname"] From 3eb0202ef545ab19e247e2df93ad53c392a227ff Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 17 Aug 2020 12:08:22 -0700 Subject: [PATCH 16/40] fix import --- tests/tensorflow/hooks/test_reductions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tensorflow/hooks/test_reductions.py b/tests/tensorflow/hooks/test_reductions.py index b27589ff6..9803b3ac2 100644 --- a/tests/tensorflow/hooks/test_reductions.py +++ b/tests/tensorflow/hooks/test_reductions.py @@ -1,7 +1,7 @@ # Standard Library # Third Party -from tests.tensorflow2.utils import verify_shapes +from tests.utils import verify_shapes # First Party import 
smdebug.tensorflow as smd From c14a67ec57f88bb4e1336f874bf80e23fab69090 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 17 Aug 2020 12:41:44 -0700 Subject: [PATCH 17/40] Handle different num tensors for losses --- tests/pytorch/test_reduce_config.py | 3 ++- tests/utils.py | 7 +++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/pytorch/test_reduce_config.py b/tests/pytorch/test_reduce_config.py index 0f6c26d74..c33baed6c 100644 --- a/tests/pytorch/test_reduce_config.py +++ b/tests/pytorch/test_reduce_config.py @@ -149,7 +149,8 @@ def forward(self, x): hook.register_module(model) optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9) train(model, hook, torch.device("cpu"), optimizer, num_steps=10) - verify_shapes(out_dir, 0, 41) + # different versions seem to output different tensors + verify_shapes(out_dir, 0, 41, exact_equal=False) if hook_created: shutil.rmtree(out_dir) diff --git a/tests/utils.py b/tests/utils.py index d087c93a7..65ed28ced 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -29,10 +29,13 @@ def use_s3_datasets(): return False -def verify_shapes(out_dir, step_num, num_tensors): +def verify_shapes(out_dir, step_num, num_tensors, exact_equal=True): trial = create_trial(out_dir) tnames = trial.tensor_names(step=step_num) - assert num_tensors == len(tnames), (len(tnames), tnames) + if exact_equal: + assert num_tensors == len(tnames), (len(tnames), tnames) + else: + assert num_tensors >= len(tnames), (len(tnames), tnames) for tname in tnames: tensor = trial.tensor(tname) assert isinstance(tensor.shape(step_num), tuple), (tname, tensor.shape(step_num)) From d12b824bbe7970847978fbe1b510c9011225671e Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 17 Aug 2020 12:56:36 -0700 Subject: [PATCH 18/40] Fix exact equal condition --- tests/pytorch/test_reduce_config.py | 2 +- tests/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/pytorch/test_reduce_config.py b/tests/pytorch/test_reduce_config.py index c33baed6c..f3d7b1214 100644 --- a/tests/pytorch/test_reduce_config.py +++ b/tests/pytorch/test_reduce_config.py @@ -149,7 +149,7 @@ def forward(self, x): hook.register_module(model) optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9) train(model, hook, torch.device("cpu"), optimizer, num_steps=10) - # different versions seem to output different tensors + # different versions seem to output different number of loss tensors verify_shapes(out_dir, 0, 41, exact_equal=False) if hook_created: shutil.rmtree(out_dir) diff --git a/tests/utils.py b/tests/utils.py index 65ed28ced..f8bc07159 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -35,7 +35,7 @@ def verify_shapes(out_dir, step_num, num_tensors, exact_equal=True): if exact_equal: assert num_tensors == len(tnames), (len(tnames), tnames) else: - assert num_tensors >= len(tnames), (len(tnames), tnames) + assert num_tensors <= len(tnames), (len(tnames), tnames) for tname in tnames: tensor = trial.tensor(tname) assert isinstance(tensor.shape(step_num), tuple), (tname, tensor.shape(step_num)) From 972d95a71e63b7c0620d10f86f04dd3dc9271546 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 17 Aug 2020 16:27:50 -0700 Subject: [PATCH 19/40] Fix mode bug --- smdebug/core/locations.py | 3 +++ tests/tensorflow/hooks/test_reductions.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/smdebug/core/locations.py b/smdebug/core/locations.py index 9712f58a2..de99f72de 100644 --- a/smdebug/core/locations.py +++ b/smdebug/core/locations.py 
@@ -23,6 +23,9 @@ def __init__(self, tname, mode, mode_step, event_file_name, start_idx, length, w def to_dict(self): return {"tensorname": self.tensorname, "start_idx": self.start_idx, "length": self.length} + def get_mode(self): + return str(self.mode).split(".")[-1] + class TensorShape: def __init__(self, name, mode, mode_step, shape, original_name=None): diff --git a/tests/tensorflow/hooks/test_reductions.py b/tests/tensorflow/hooks/test_reductions.py index 9803b3ac2..1b857fecc 100644 --- a/tests/tensorflow/hooks/test_reductions.py +++ b/tests/tensorflow/hooks/test_reductions.py @@ -70,7 +70,7 @@ def test_shapes(out_dir, save_raw_tensor=False): include_collections=["weights", "gradients", "losses"], ) simple_model(hook) - verify_shapes(out_dir, 0, 2) + verify_shapes(out_dir, 0, 3) def test_reductions_with_raw_tensor(out_dir): From 850cc4446e98e40eca525fad128c2d643ff14d3a Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 18 Aug 2020 11:04:22 -0700 Subject: [PATCH 20/40] trigger CI From 2c4479697fb27a5ae2dbb25f6d38f9b124874f66 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 18 Aug 2020 22:10:17 -0700 Subject: [PATCH 21/40] Add support for distributed training with writer map --- smdebug/core/hook.py | 48 ++++++++++++++-------- smdebug/core/locations.py | 3 ++ smdebug/tensorflow/base_hook.py | 52 ++++++++++-------------- tests/tensorflow2/test_keras_mirrored.py | 14 +++++++ tests/utils.py | 14 ++++++- 5 files changed, 81 insertions(+), 50 deletions(-) diff --git a/smdebug/core/hook.py b/smdebug/core/hook.py index 5256c9adf..345ee5317 100644 --- a/smdebug/core/hook.py +++ b/smdebug/core/hook.py @@ -410,6 +410,17 @@ def _prepare_collections(self): self.prepared_collections = True #### End of Save Manager methods #### + @staticmethod + def _close_given_writer_map(writer_dict): + # Delete all the dist training writers + to_delete_writers = [] + for key, writer in writer_dict.items(): + # close calls flush + writer.close() + to_delete_writers.append(key) + + for key in to_delete_writers: + del writer_dict[key] def _close_writers(self) -> None: if self.dry_run: @@ -423,15 +434,7 @@ def _close_writers(self) -> None: self.writer.close() self.writer = None - to_delete_writers = [] - # Delete all the tb writers - for mode, writer in self.tb_writers.items(): - if writer is not None: - writer.flush() - writer.close() - to_delete_writers.append(mode) - for mode in to_delete_writers: - del self.tb_writers[mode] + self._close_given_writer_map(self.tb_writers) if self.shape_writer is not None: self.shape_writer.close() @@ -474,7 +477,13 @@ def _initialize_writers(self, only_initialize_if_missing=False) -> None: index_writer=self.writer.index_writer, ) - def _get_writers(self, tensor_name, tensor_ref=None) -> List[FileWriter]: + def _get_single_process_writers(self, shape_writers=False) -> List[FileWriter]: + if shape_writers is False: + return [self.writer] if self.writer else [] + else: + return [self.shape_writer] if self.shape_writer else [] + + def _get_writers(self, tensor_name, tensor_ref=None, shape_writers=False) -> List[FileWriter]: """ :param tensor_name: :param tensor_ref: used by TF @@ -482,7 +491,7 @@ def _get_writers(self, tensor_name, tensor_ref=None) -> List[FileWriter]: """ if self.save_all_workers is False and self.worker != self.chief_worker: return [] - return [self.writer] if self.writer else [] + return self._get_single_process_writers(shape_writers) def _maybe_get_tb_writer(self) -> Optional[FileWriter]: """ Returns a FileWriter object if `hook.tensorboard_dir` has 
been specified, else None. @@ -745,6 +754,7 @@ def _write_raw_tensor(self, tensor_name, tensor_value, save_collections, tensor_ break def _write_shape(self, tensor_name, tensor_value, save_collections, tensor_ref=None): + shape_writers = self._get_writers(tensor_name, tensor_ref=tensor_ref, shape_writers=True) for s_col in save_collections: reduction_config = s_col.reduction_config if self.dry_run is False and reduction_config.save_shape is True: @@ -754,13 +764,15 @@ def _write_shape(self, tensor_name, tensor_value, save_collections, tensor_ref=N original_name = tensor_ref.tf_obj.name else: original_name = None - self.shape_writer.write_shape( - tensor_name, - this_shape, - self.mode, - self.mode_steps[self.mode], - original_name=original_name, - ) + + for writer in shape_writers: + writer.write_shape( + tensor_name, + this_shape, + self.mode, + self.mode_steps[self.mode], + original_name=original_name, + ) break def _write_raw_tensor_simple(self, tensor_name, tensor_value, tensor_ref=None, timestamp=None): diff --git a/smdebug/core/locations.py b/smdebug/core/locations.py index de99f72de..03c389224 100644 --- a/smdebug/core/locations.py +++ b/smdebug/core/locations.py @@ -40,6 +40,9 @@ def __init__(self, name, mode, mode_step, shape, original_name=None): def to_dict(self): return {"tensorname": self.name, "originalname": self.original_name, "shape": self.shape} + def get_mode(self): + return str(self.mode).split(".")[-1] + STEP_NUMBER_FORMATTING_LENGTH = "012" diff --git a/smdebug/tensorflow/base_hook.py b/smdebug/tensorflow/base_hook.py index 02c69b36f..ad6557ade 100644 --- a/smdebug/tensorflow/base_hook.py +++ b/smdebug/tensorflow/base_hook.py @@ -262,7 +262,7 @@ def _set_chief_worker(self): elif self.distribution_strategy == TFDistributionStrategy.UNSUPPORTED: raise NotImplementedError - def _get_writers(self, tensor_name, tensor_ref) -> List[FileWriter]: + def _get_writers(self, tensor_name, tensor_ref, shape_writers=False) -> List[FileWriter]: """ For tensors generated during distributed tf jobs, we map the tensor to a writer with its device attribute. 
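Before the per-device routing below, a minimal sketch of the pairing that _initialize_writers sets up may help: the ShapeWriter owns no file of its own and appends TensorShape records into the index writer of the event FileWriter. The trial directory, worker name, and tensor name here are invented, and only the single-process (non-mirrored) case is shown:

    # A sketch of the writer wiring, assuming the single-process case.
    from smdebug.core.modes import ModeKeys
    from smdebug.core.writer import FileWriter, ShapeWriter

    writer = FileWriter(trial_dir="/tmp/demo_run", step=0, worker="worker_0")
    shape_writer = ShapeWriter(
        trial_dir="/tmp/demo_run",
        step=0,
        worker="worker_0",
        index_writer=writer.index_writer,  # shared, so shapes land in the same index file
    )
    shape_writer.write_shape("dense/kernel:0", (784, 128), mode=ModeKeys.TRAIN, mode_step=0)
    writer.close()        # closes the event file and its index writer
    shape_writer.close()  # safe: IndexWriter.close() is a no-op once already closed
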
@@ -278,8 +278,8 @@ def _get_writers(self, tensor_name, tensor_ref) -> List[FileWriter]: TFDistributionStrategy.PARAMETER_SERVER, TFDistributionStrategy.HOROVOD, ]: - if (self.save_all_workers is True or self.worker == self.chief_worker) and self.writer: - return [self.writer] + if self.save_all_workers is True or self.worker == self.chief_worker: + return self._get_single_process_writers(shape_writers) elif self.distribution_strategy == TFDistributionStrategy.MIRRORED: if len(self.device_map): # else is for metrics in Keras @@ -290,17 +290,25 @@ def _get_writers(self, tensor_name, tensor_ref) -> List[FileWriter]: # if device str is empty or cpu in worker if not bool(worker) or "CPU" in worker: if self.save_all_workers: - return list(self.writer_map.values()) + if shape_writers is False: + return list(self.writer_map.values()) + else: + return list(self.shape_writer_map.values()) else: - return [self.writer_map[self.device_map[self.chief_worker]]] + if shape_writers is False: + return [self.writer_map[self.device_map[self.chief_worker]]] + else: + return [self.shape_writer_map[self.device_map[self.chief_worker]]] elif self.save_all_workers or worker == self.chief_worker: - return [self.writer_map[self.device_map[worker]]] - elif self.writer: + if shape_writers is False: + return [self.writer_map[self.device_map[worker]]] + else: + return [self.shape_writer_map[self.device_map[worker]]] + else: # training on CPU when all device strings have cpu - return [self.writer] + return self._get_single_process_writers(shape_writers) elif self.distribution_strategy == TFDistributionStrategy.NONE: - if self.writer: - return [self.writer] + return self._get_single_process_writers(shape_writers) else: raise NotImplementedError # when self.writer is None, returns empty list @@ -338,7 +346,7 @@ def _initialize_writers(self, only_initialize_if_missing=False) -> None: trial_dir=self.out_dir, step=self.step, worker=device_string ) if self._saving_shapes_in_step(): - self.shape_writer[device_string] = ShapeWriter( + self.shape_writer_map[device_string] = ShapeWriter( trial_dir=self.out_dir, step=self.step, worker=self.worker, @@ -383,25 +391,9 @@ def _close_writers(self) -> None: self.writer.close() self.writer = None - # Delete all the dist training writers - to_delete_writers = [] - for device, writer in self.writer_map.items(): - writer.flush() - writer.close() - to_delete_writers.append(device) - - for device in to_delete_writers: - del self.writer_map[device] - - to_delete_writers = [] - # Delete all the tb writers - for mode, writer in self.tb_writers.items(): - if writer is not None: - writer.flush() - writer.close() - to_delete_writers.append(mode) - for mode in to_delete_writers: - del self.tb_writers[mode] + self._close_given_writer_map(self.writer_map) + self._close_given_writer_map(self.shape_writer_map) + self._close_given_writer_map(self.tb_writers) if self.shape_writer is not None: self.shape_writer.close() diff --git a/tests/tensorflow2/test_keras_mirrored.py b/tests/tensorflow2/test_keras_mirrored.py index d857218ab..c68bffdff 100644 --- a/tests/tensorflow2/test_keras_mirrored.py +++ b/tests/tensorflow2/test_keras_mirrored.py @@ -13,6 +13,7 @@ from tests.core.utils import verify_files from tests.tensorflow2.utils import is_tf_2_2, is_tf_2_3 from tests.tensorflow.utils import create_trial_fast_refresh +from tests.utils import verify_shapes # First Party import smdebug.tensorflow as smd @@ -290,6 +291,19 @@ def test_save_all(out_dir, tf_eager_mode, workers): verify_files(out_dir, save_config, 
saved_scalars) +@pytest.mark.slow +def test_shapes(out_dir, tf_eager_mode): + train_model( + out_dir, + save_all=True, + save_config=SaveConfig(save_steps=[0]), + reduction_config=ReductionConfig(save_shape=True), + steps=["train"], + eager=tf_eager_mode, + ) + verify_shapes(out_dir, 0, 15, multiworker=True) + + @pytest.mark.slow def test_base_reductions(out_dir, tf_eager_mode): train_model( diff --git a/tests/utils.py b/tests/utils.py index f8bc07159..ab3fb703b 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -29,7 +29,7 @@ def use_s3_datasets(): return False -def verify_shapes(out_dir, step_num, num_tensors, exact_equal=True): +def verify_shapes(out_dir, step_num, num_tensors, exact_equal=True, multiworker=False): trial = create_trial(out_dir) tnames = trial.tensor_names(step=step_num) if exact_equal: @@ -38,7 +38,17 @@ def verify_shapes(out_dir, step_num, num_tensors, exact_equal=True): assert num_tensors <= len(tnames), (len(tnames), tnames) for tname in tnames: tensor = trial.tensor(tname) - assert isinstance(tensor.shape(step_num), tuple), (tname, tensor.shape(step_num)) + if multiworker is False: + assert isinstance(tensor.shape(step_num), tuple), (tname, tensor.shape(step_num)) + else: + workers = tensor.workers(step_num) + assert len(workers) > 1 + for w in workers: + assert isinstance(tensor.shape(step_num, worker=w), tuple), ( + tname, + w, + tensor.shape(step_num, worker=w), + ) class SagemakerSimulator(object): From 1b09b8e40dcea0a6a6052941d2c018ed5e591a56 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 18 Aug 2020 22:48:44 -0700 Subject: [PATCH 22/40] Check that value throws exception --- smdebug/core/index_reader.py | 81 +++++++++++++----------------------- smdebug/core/tensor.py | 1 + tests/utils.py | 25 +++++++++++ 3 files changed, 54 insertions(+), 53 deletions(-) diff --git a/smdebug/core/index_reader.py b/smdebug/core/index_reader.py index 5af1ef157..525d8608e 100644 --- a/smdebug/core/index_reader.py +++ b/smdebug/core/index_reader.py @@ -120,12 +120,22 @@ def fetch_tensor_value(self, tensor_location: TensorLocation): def list_event_files(self, start_after_prefix): pass - @abstractmethod def load_tensor_data_from_index_files( self, start_after_key=None, range_steps=None ) -> Tuple[Dict[str, Dict[int, Dict[str, TensorLocation]]], str]: """Return a triply nested dict referring to tensor data.""" + responses, steps, last_index_token, workers = self.read_index_files( + start_after_key, range_steps + ) + + tensor_data = {} + for step, response, worker in zip(steps, responses, workers): + tensor_data = self._update_tensors_from_json( + tensor_data, step, response, self.path, worker + ) + return tensor_data, last_index_token + @abstractmethod def _is_event_file_present(self, file_name) -> bool: pass @@ -236,30 +246,26 @@ def _update_tensors_from_json( mode = ModeKeys[mode.strip()] mode_step = index_meta["mode_step"] - if "event_file_name" in index_meta: - event_file_name = os.path.join(path, index_meta["event_file_name"]) - else: - event_file_name = None - to_update_index_dict = [] - tensor_payload = index_dict.get("tensor_payload", []) - for tensor in tensor_payload: - tensor_name = tensor["tensorname"] - start_idx = tensor["start_idx"] - length = tensor["length"] - tensor_location = TensorLocation( - tensor_name, mode, mode_step, event_file_name, start_idx, length, worker - ) - to_update_index_dict.append((tensor_name, step, tensor_location)) + if "tensor_payload" in index_dict: + event_file_name = os.path.join(path, index_meta["event_file_name"]) + for tensor in 
index_dict["tensor_payload"]: + tensor_name = tensor["tensorname"] + start_idx = tensor["start_idx"] + length = tensor["length"] + tensor_location = TensorLocation( + tensor_name, mode, mode_step, event_file_name, start_idx, length, worker + ) + to_update_index_dict.append((tensor_name, step, tensor_location)) - shape_payload = index_dict.get("shape_payload", []) - for tensor in shape_payload: - tensor_name = tensor["tensorname"] - original_name = tensor["originalname"] - shape = tensor["shape"] - ts = TensorShape(tensor_name, mode, mode_step, shape, original_name) - to_update_index_dict.append((tensor_name, step, ts)) + if "shape_payload" in index_dict: + for tensor in index_dict["shape_payload"]: + tensor_name = tensor["tensorname"] + original_name = tensor["originalname"] + shape = tensor["shape"] + ts = TensorShape(tensor_name, mode, mode_step, shape, original_name) + to_update_index_dict.append((tensor_name, step, ts)) for tu in to_update_index_dict: tensor_name, step, obj = tu @@ -304,22 +310,6 @@ def fetch_tensor_value(self, tensor_location: TensorLocation) -> np.ndarray: tensor_name, step, tensor_data, mode, mode_step = tensor_tuple return tensor_data - def load_tensor_data_from_index_files( - self, start_after_key=None, range_steps=None - ) -> Tuple[Dict[str, Dict[int, Dict[str, TensorLocation]]], str]: - """Return a triply nested dict referring to tensor data.""" - - responses, steps, last_index_token, workers = self.read_index_files( - start_after_key, range_steps - ) - - tensor_data = {} - for step, response, worker in zip(steps, responses, workers): - tensor_data = self._update_tensors_from_json( - tensor_data, step, response, self.path, worker - ) - return tensor_data, last_index_token - def read_index_files( self, start_after_key: str, range_steps=None ) -> Tuple[List[bytes], list, str, List[str]]: @@ -417,21 +407,6 @@ def fetch_tensor_value(self, tensor_location: TensorLocation) -> np.ndarray: tensor_name, step, tensor_data, mode, mode_step = tensor_tuple return tensor_data - def load_tensor_data_from_index_files( - self, start_after_key=None, range_steps=None - ) -> Tuple[Dict[str, Dict[int, Dict[str, TensorLocation]]], str]: - """Return a triply nested dict referring to tensor data.""" - - responses, steps, last_index_token, workers = self.read_index_files( - start_after_key, range_steps - ) - tensor_data = {} - for step, response, worker in zip(steps, responses, workers): - tensor_data = self._update_tensors_from_json( - tensor_data, step, response, self.path, worker - ) - return tensor_data, last_index_token - def read_index_files( self, start_after_key: str, range_steps=None ) -> Tuple[List[bytes], list, str, List[str]]: diff --git a/smdebug/core/tensor.py b/smdebug/core/tensor.py index de52f7858..db2098c3f 100644 --- a/smdebug/core/tensor.py +++ b/smdebug/core/tensor.py @@ -266,6 +266,7 @@ def values(self, mode=ModeKeys.GLOBAL, worker=None): def value(self, step_num, mode=ModeKeys.GLOBAL, worker=None): # step refreshes s = self._step(step_num=step_num, mode=mode, worker=worker) + print(s, s.value) if s.value is not None: return s.value elif s.location is not None: diff --git a/tests/utils.py b/tests/utils.py index ab3fb703b..555dc8719 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -5,6 +5,7 @@ # Third Party import boto3 +import numpy as np from tests.constants import TEST_DATASET_S3_PATH # First Party @@ -16,6 +17,7 @@ TENSORBOARD_CONFIG_FILE_PATH_ENV_STR, ) from smdebug.core.utils import is_s3, remove_file_if_exists +from smdebug.exceptions import 
TensorUnavailableForStep from smdebug.trials import create_trial @@ -29,6 +31,15 @@ def use_s3_datasets(): return False +def is_scalar(x): + if isinstance(x, list): + if len(x) == 1: + return True + elif isinstance(x, np.ndarray): + return True + return False + + def verify_shapes(out_dir, step_num, num_tensors, exact_equal=True, multiworker=False): trial = create_trial(out_dir) tnames = trial.tensor_names(step=step_num) @@ -40,10 +51,24 @@ def verify_shapes(out_dir, step_num, num_tensors, exact_equal=True, multiworker= tensor = trial.tensor(tname) if multiworker is False: assert isinstance(tensor.shape(step_num), tuple), (tname, tensor.shape(step_num)) + try: + if not is_scalar(tensor.value(step_num)): + # test did not save value except scalars which dont use reduction config + # so it should raise the below exception + assert False + except TensorUnavailableForStep: + pass else: workers = tensor.workers(step_num) assert len(workers) > 1 for w in workers: + try: + if not is_scalar(tensor.value(step_num, worker=w)): + # test did not save value so it should raise the below exception + assert False + except TensorUnavailableForStep: + pass + assert isinstance(tensor.shape(step_num, worker=w), tuple), ( tname, w, From f4106f33a2a6c3c81dda817a4f4759aee4c2f127 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 19 Aug 2020 13:00:06 -0700 Subject: [PATCH 23/40] Fix tests to make them more resilient --- smdebug/core/tensor.py | 1 - tests/mxnet/test_hook_reduce_config.py | 4 ++-- tests/tensorflow/hooks/test_estimator_modes.py | 11 ++++++++++- tests/tensorflow/hooks/test_reductions.py | 6 +++++- tests/tensorflow/keras/test_keras.py | 4 +++- tests/tensorflow2/test_keras.py | 7 ++++--- tests/tensorflow2/test_keras_mirrored.py | 12 +++++++++++- tests/utils.py | 13 ++++--------- 8 files changed, 39 insertions(+), 19 deletions(-) diff --git a/smdebug/core/tensor.py b/smdebug/core/tensor.py index db2098c3f..de52f7858 100644 --- a/smdebug/core/tensor.py +++ b/smdebug/core/tensor.py @@ -266,7 +266,6 @@ def values(self, mode=ModeKeys.GLOBAL, worker=None): def value(self, step_num, mode=ModeKeys.GLOBAL, worker=None): # step refreshes s = self._step(step_num=step_num, mode=mode, worker=worker) - print(s, s.value) if s.value is not None: return s.value elif s.location is not None: diff --git a/tests/mxnet/test_hook_reduce_config.py b/tests/mxnet/test_hook_reduce_config.py index 550f82a1a..898776d87 100644 --- a/tests/mxnet/test_hook_reduce_config.py +++ b/tests/mxnet/test_hook_reduce_config.py @@ -100,8 +100,8 @@ def test_save_shapes(out_dir, hook=None): reduction_config=global_reduce_config, ) run_mnist_gluon_model(hook=hook, num_steps_train=5) - verify_shapes(out_dir, 0, 49) - verify_shapes(out_dir, 1, 49) + verify_shapes(out_dir, 0, 40, exact_equal=False) + verify_shapes(out_dir, 1, 40, exact_equal=False) if hook_created: shutil.rmtree(out_dir) diff --git a/tests/tensorflow/hooks/test_estimator_modes.py b/tests/tensorflow/hooks/test_estimator_modes.py index c947c8ddf..62a4f114e 100644 --- a/tests/tensorflow/hooks/test_estimator_modes.py +++ b/tests/tensorflow/hooks/test_estimator_modes.py @@ -18,6 +18,7 @@ import pytest import tensorflow as tf from tests.analysis.utils import delete_s3_prefix +from tests.tensorflow.utils import create_trial_fast_refresh from tests.utils import verify_shapes # First Party @@ -209,7 +210,15 @@ def test_mnist_shapes(out_dir, on_s3=False): steps=None, reduction_config=smd.ReductionConfig(save_shape=True), ) - verify_shapes(out_dir, 0, 249) + verify_shapes( + out_dir, + 0, + 
[ + "conv2d/kernel:0", + "gradients/sparse_softmax_cross_entropy_loss/value_grad/Sum:0", + "dense_1/kernel:0", + ], + ) @pytest.mark.slow # 0:02 to run diff --git a/tests/tensorflow/hooks/test_reductions.py b/tests/tensorflow/hooks/test_reductions.py index 1b857fecc..3cb18624c 100644 --- a/tests/tensorflow/hooks/test_reductions.py +++ b/tests/tensorflow/hooks/test_reductions.py @@ -70,7 +70,11 @@ def test_shapes(out_dir, save_raw_tensor=False): include_collections=["weights", "gradients", "losses"], ) simple_model(hook) - verify_shapes(out_dir, 0, 3) + verify_shapes( + out_dir, + 0, + ["foobar/weight1:0", "gradients/MatMul_grad/tuple/control_dependency_1:0", "loss:0"], + ) def test_reductions_with_raw_tensor(out_dir): diff --git a/tests/tensorflow/keras/test_keras.py b/tests/tensorflow/keras/test_keras.py index 654d05418..a122722be 100644 --- a/tests/tensorflow/keras/test_keras.py +++ b/tests/tensorflow/keras/test_keras.py @@ -236,7 +236,9 @@ def test_tf_keras_shapes(out_dir): eager=False, steps=["train", "eval", "predict", "train"], ) - verify_shapes(out_dir, 0, 21) + verify_shapes( + out_dir, 0, ["training/RMSprop/momentum:0", "dense/weights/dense/kernel:0", "batch", "loss"] + ) @pytest.mark.slow # 0:03 to run diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 349281d5c..cc676eb20 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -191,8 +191,8 @@ def test_keras_gradtape_shapes(out_dir): reduction_config=ReductionConfig(save_shape=True), ) helper_keras_gradtape(trial_dir=out_dir, hook=hook) - verify_shapes(out_dir, 0, 10) - verify_shapes(out_dir, 500, 15) + verify_shapes(out_dir, 0, ["gradients/dense_1/biasGrad", "weights/dense/bias:0", "loss"]) + verify_shapes(out_dir, 500, ["weights/dense/bias:0", "Adam/learning_rate:0", "loss"]) @pytest.mark.skip_if_non_eager @@ -473,7 +473,8 @@ def test_keras_fit_shapes(out_dir): reduction_config=ReductionConfig(save_shape=True), ) helper_keras_fit(trial_dir=out_dir, hook=hook) - verify_shapes(out_dir, 0, 12) + print(create_trial_fast_refresh(out_dir).tensor_names(step=0)) + verify_shapes(out_dir, 0, ["dense/weights/dense/kernel:0", "accuracy", "Adam/beta_1:0"]) @pytest.mark.slow diff --git a/tests/tensorflow2/test_keras_mirrored.py b/tests/tensorflow2/test_keras_mirrored.py index c68bffdff..c49b65e3d 100644 --- a/tests/tensorflow2/test_keras_mirrored.py +++ b/tests/tensorflow2/test_keras_mirrored.py @@ -301,7 +301,17 @@ def test_shapes(out_dir, tf_eager_mode): steps=["train"], eager=tf_eager_mode, ) - verify_shapes(out_dir, 0, 15, multiworker=True) + verify_shapes( + out_dir, + 0, + [ + "dense_1/weights/dense_1/kernel:0", + "scalar/foobar", + "dense/weights/dense/bias:0", + "Adam/decay:0", + ], + multiworker=True, + ) @pytest.mark.slow diff --git a/tests/utils.py b/tests/utils.py index 555dc8719..98e1091f1 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -7,6 +7,7 @@ import boto3 import numpy as np from tests.constants import TEST_DATASET_S3_PATH +from tests.tensorflow.utils import create_trial_fast_refresh # First Party from smdebug.core.config_constants import ( @@ -18,7 +19,6 @@ ) from smdebug.core.utils import is_s3, remove_file_if_exists from smdebug.exceptions import TensorUnavailableForStep -from smdebug.trials import create_trial def use_s3_datasets(): @@ -40,14 +40,9 @@ def is_scalar(x): return False -def verify_shapes(out_dir, step_num, num_tensors, exact_equal=True, multiworker=False): - trial = create_trial(out_dir) - tnames = trial.tensor_names(step=step_num) - if 
exact_equal: - assert num_tensors == len(tnames), (len(tnames), tnames) - else: - assert num_tensors <= len(tnames), (len(tnames), tnames) - for tname in tnames: +def verify_shapes(out_dir, step_num, tensornames, multiworker=False): + trial = create_trial_fast_refresh(out_dir) + for tname in tensornames: tensor = trial.tensor(tname) if multiworker is False: assert isinstance(tensor.shape(step_num), tuple), (tname, tensor.shape(step_num)) From 78b67d615b3a5604f6d09d4a12d7524dbba3d450 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 19 Aug 2020 13:05:57 -0700 Subject: [PATCH 24/40] Fix mxnet and pytorch tests --- tests/mxnet/test_hook_reduce_config.py | 26 ++++++++++++++++++++++++-- tests/pytorch/test_reduce_config.py | 4 +++- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/tests/mxnet/test_hook_reduce_config.py b/tests/mxnet/test_hook_reduce_config.py index 898776d87..3f8aa11d1 100644 --- a/tests/mxnet/test_hook_reduce_config.py +++ b/tests/mxnet/test_hook_reduce_config.py @@ -100,8 +100,30 @@ def test_save_shapes(out_dir, hook=None): reduction_config=global_reduce_config, ) run_mnist_gluon_model(hook=hook, num_steps_train=5) - verify_shapes(out_dir, 0, 40, exact_equal=False) - verify_shapes(out_dir, 1, 40, exact_equal=False) + verify_shapes( + out_dir, + 0, + [ + "dense0_relu_input_0", + "dense0_relu_output_0", + "pool1_output_0", + "gradient/dense0_weight", + "conv0_weight", + "softmaxcrossentropyloss0_input_0", + ], + ) + verify_shapes( + out_dir, + 1, + [ + "dense0_relu_input_0", + "dense0_relu_output_0", + "pool1_output_0", + "gradient/dense0_weight", + "conv0_weight", + "softmaxcrossentropyloss0_input_0", + ], + ) if hook_created: shutil.rmtree(out_dir) diff --git a/tests/pytorch/test_reduce_config.py b/tests/pytorch/test_reduce_config.py index f3d7b1214..633770cf6 100644 --- a/tests/pytorch/test_reduce_config.py +++ b/tests/pytorch/test_reduce_config.py @@ -150,7 +150,9 @@ def forward(self, x): optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9) train(model, hook, torch.device("cpu"), optimizer, num_steps=10) # different versions seem to output different number of loss tensors - verify_shapes(out_dir, 0, 41, exact_equal=False) + verify_shapes( + out_dir, 0, ["conv2_input_0", "NestedNet_fc1.bias", "gradient/NestedNet_conv2.weight"] + ) if hook_created: shutil.rmtree(out_dir) From 2515a2db58b7082e764991ce9725074d68d7cf3d Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 19 Aug 2020 13:15:28 -0700 Subject: [PATCH 25/40] Remove tensor names --- tests/mxnet/test_hook_reduce_config.py | 26 ++----------------- tests/pytorch/test_reduce_config.py | 4 +-- .../tensorflow/hooks/test_estimator_modes.py | 10 +------ tests/tensorflow/hooks/test_reductions.py | 6 +---- tests/tensorflow/keras/test_keras.py | 4 +-- tests/tensorflow2/test_keras.py | 4 +-- tests/tensorflow2/test_keras_mirrored.py | 12 +-------- tests/utils.py | 4 +-- 8 files changed, 11 insertions(+), 59 deletions(-) diff --git a/tests/mxnet/test_hook_reduce_config.py b/tests/mxnet/test_hook_reduce_config.py index 3f8aa11d1..1ef2be4a5 100644 --- a/tests/mxnet/test_hook_reduce_config.py +++ b/tests/mxnet/test_hook_reduce_config.py @@ -100,30 +100,8 @@ def test_save_shapes(out_dir, hook=None): reduction_config=global_reduce_config, ) run_mnist_gluon_model(hook=hook, num_steps_train=5) - verify_shapes( - out_dir, - 0, - [ - "dense0_relu_input_0", - "dense0_relu_output_0", - "pool1_output_0", - "gradient/dense0_weight", - "conv0_weight", - "softmaxcrossentropyloss0_input_0", - ], - ) - 
verify_shapes( - out_dir, - 1, - [ - "dense0_relu_input_0", - "dense0_relu_output_0", - "pool1_output_0", - "gradient/dense0_weight", - "conv0_weight", - "softmaxcrossentropyloss0_input_0", - ], - ) + verify_shapes(out_dir, 0) + verify_shapes(out_dir, 1) if hook_created: shutil.rmtree(out_dir) diff --git a/tests/pytorch/test_reduce_config.py b/tests/pytorch/test_reduce_config.py index 633770cf6..de97b30da 100644 --- a/tests/pytorch/test_reduce_config.py +++ b/tests/pytorch/test_reduce_config.py @@ -150,9 +150,7 @@ def forward(self, x): optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9) train(model, hook, torch.device("cpu"), optimizer, num_steps=10) # different versions seem to output different number of loss tensors - verify_shapes( - out_dir, 0, ["conv2_input_0", "NestedNet_fc1.bias", "gradient/NestedNet_conv2.weight"] - ) + verify_shapes(out_dir, 0) if hook_created: shutil.rmtree(out_dir) diff --git a/tests/tensorflow/hooks/test_estimator_modes.py b/tests/tensorflow/hooks/test_estimator_modes.py index 62a4f114e..b07ab1914 100644 --- a/tests/tensorflow/hooks/test_estimator_modes.py +++ b/tests/tensorflow/hooks/test_estimator_modes.py @@ -210,15 +210,7 @@ def test_mnist_shapes(out_dir, on_s3=False): steps=None, reduction_config=smd.ReductionConfig(save_shape=True), ) - verify_shapes( - out_dir, - 0, - [ - "conv2d/kernel:0", - "gradients/sparse_softmax_cross_entropy_loss/value_grad/Sum:0", - "dense_1/kernel:0", - ], - ) + verify_shapes(out_dir, 0) @pytest.mark.slow # 0:02 to run diff --git a/tests/tensorflow/hooks/test_reductions.py b/tests/tensorflow/hooks/test_reductions.py index 3cb18624c..e009f4565 100644 --- a/tests/tensorflow/hooks/test_reductions.py +++ b/tests/tensorflow/hooks/test_reductions.py @@ -70,11 +70,7 @@ def test_shapes(out_dir, save_raw_tensor=False): include_collections=["weights", "gradients", "losses"], ) simple_model(hook) - verify_shapes( - out_dir, - 0, - ["foobar/weight1:0", "gradients/MatMul_grad/tuple/control_dependency_1:0", "loss:0"], - ) + verify_shapes(out_dir, 0) def test_reductions_with_raw_tensor(out_dir): diff --git a/tests/tensorflow/keras/test_keras.py b/tests/tensorflow/keras/test_keras.py index a122722be..bfd5e7cc7 100644 --- a/tests/tensorflow/keras/test_keras.py +++ b/tests/tensorflow/keras/test_keras.py @@ -236,9 +236,7 @@ def test_tf_keras_shapes(out_dir): eager=False, steps=["train", "eval", "predict", "train"], ) - verify_shapes( - out_dir, 0, ["training/RMSprop/momentum:0", "dense/weights/dense/kernel:0", "batch", "loss"] - ) + verify_shapes(out_dir, 0) @pytest.mark.slow # 0:03 to run diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index cc676eb20..ffa5c9ae8 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -191,8 +191,8 @@ def test_keras_gradtape_shapes(out_dir): reduction_config=ReductionConfig(save_shape=True), ) helper_keras_gradtape(trial_dir=out_dir, hook=hook) - verify_shapes(out_dir, 0, ["gradients/dense_1/biasGrad", "weights/dense/bias:0", "loss"]) - verify_shapes(out_dir, 500, ["weights/dense/bias:0", "Adam/learning_rate:0", "loss"]) + verify_shapes(out_dir, 0) + verify_shapes(out_dir, 500) @pytest.mark.skip_if_non_eager diff --git a/tests/tensorflow2/test_keras_mirrored.py b/tests/tensorflow2/test_keras_mirrored.py index c49b65e3d..3d9ab247d 100644 --- a/tests/tensorflow2/test_keras_mirrored.py +++ b/tests/tensorflow2/test_keras_mirrored.py @@ -301,17 +301,7 @@ def test_shapes(out_dir, tf_eager_mode): steps=["train"], eager=tf_eager_mode, ) - 
verify_shapes( - out_dir, - 0, - [ - "dense_1/weights/dense_1/kernel:0", - "scalar/foobar", - "dense/weights/dense/bias:0", - "Adam/decay:0", - ], - multiworker=True, - ) + verify_shapes(out_dir, 0, multiworker=True) @pytest.mark.slow diff --git a/tests/utils.py b/tests/utils.py index 98e1091f1..af827f264 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -40,9 +40,9 @@ def is_scalar(x): return False -def verify_shapes(out_dir, step_num, tensornames, multiworker=False): +def verify_shapes(out_dir, step_num, multiworker=False): trial = create_trial_fast_refresh(out_dir) - for tname in tensornames: + for tname in trial.tensor_names(step=step_num): tensor = trial.tensor(tname) if multiworker is False: assert isinstance(tensor.shape(step_num), tuple), (tname, tensor.shape(step_num)) From 7f3ea4e57b497ea4b15fc60b9eed9f2ea2667b60 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Wed, 19 Aug 2020 13:47:48 -0700 Subject: [PATCH 26/40] pre-commit --- tests/tensorflow/hooks/test_estimator_modes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/tensorflow/hooks/test_estimator_modes.py b/tests/tensorflow/hooks/test_estimator_modes.py index b07ab1914..b7de19d6e 100644 --- a/tests/tensorflow/hooks/test_estimator_modes.py +++ b/tests/tensorflow/hooks/test_estimator_modes.py @@ -18,7 +18,6 @@ import pytest import tensorflow as tf from tests.analysis.utils import delete_s3_prefix -from tests.tensorflow.utils import create_trial_fast_refresh from tests.utils import verify_shapes # First Party From cdf6578b21f20a8ff33fb138337a768bbd773227 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 19 Aug 2020 15:23:07 -0700 Subject: [PATCH 27/40] Fix get_mode --- smdebug/core/index_reader.py | 4 ++-- smdebug/core/locations.py | 6 ------ smdebug/core/writer.py | 2 +- tests/mxnet/test_hook_reduce_config.py | 26 +++++++++++--------------- 4 files changed, 14 insertions(+), 24 deletions(-) diff --git a/smdebug/core/index_reader.py b/smdebug/core/index_reader.py index 525d8608e..dfc8eb0d2 100644 --- a/smdebug/core/index_reader.py +++ b/smdebug/core/index_reader.py @@ -248,7 +248,7 @@ def _update_tensors_from_json( to_update_index_dict = [] - if "tensor_payload" in index_dict: + if len(index_dict["tensor_payload"]): event_file_name = os.path.join(path, index_meta["event_file_name"]) for tensor in index_dict["tensor_payload"]: tensor_name = tensor["tensorname"] @@ -259,7 +259,7 @@ def _update_tensors_from_json( ) to_update_index_dict.append((tensor_name, step, tensor_location)) - if "shape_payload" in index_dict: + if len(index_dict["shape_payload"]): for tensor in index_dict["shape_payload"]: tensor_name = tensor["tensorname"] original_name = tensor["originalname"] diff --git a/smdebug/core/locations.py b/smdebug/core/locations.py index 03c389224..9712f58a2 100644 --- a/smdebug/core/locations.py +++ b/smdebug/core/locations.py @@ -23,9 +23,6 @@ def __init__(self, tname, mode, mode_step, event_file_name, start_idx, length, w def to_dict(self): return {"tensorname": self.tensorname, "start_idx": self.start_idx, "length": self.length} - def get_mode(self): - return str(self.mode).split(".")[-1] - class TensorShape: def __init__(self, name, mode, mode_step, shape, original_name=None): @@ -40,9 +37,6 @@ def __init__(self, name, mode, mode_step, shape, original_name=None): def to_dict(self): return {"tensorname": self.name, "originalname": self.original_name, "shape": self.shape} - def get_mode(self): - return str(self.mode).split(".")[-1] - STEP_NUMBER_FORMATTING_LENGTH = "012" diff --git a/smdebug/core/writer.py
index 3ed1f6da5..3966da82b 100644
--- a/smdebug/core/writer.py
+++ b/smdebug/core/writer.py
@@ -260,7 +260,7 @@ def write_shape(
         self, name, shape: Tuple[int], mode=ModeKeys.GLOBAL, mode_step=None, original_name=None
     ):
         self._index_writer.add_shape(
-            TensorShape(name, mode, mode_step, shape, original_name=original_name)
+            TensorShape(name, mode.name, mode_step, shape, original_name=original_name)
         )

     def flush(self):

diff --git a/tests/mxnet/test_hook_reduce_config.py b/tests/mxnet/test_hook_reduce_config.py
index 1ef2be4a5..46476414d 100644
--- a/tests/mxnet/test_hook_reduce_config.py
+++ b/tests/mxnet/test_hook_reduce_config.py
@@ -86,24 +86,20 @@ def test_save_config(hook=None, out_dir=None):
         shutil.rmtree(out_dir)


-def test_save_shapes(out_dir, hook=None):
-    hook_created = False
-    if hook is None:
-        hook_created = True
-        global_reduce_config = ReductionConfig(save_shape=True)
-        global_save_config = SaveConfig(save_steps=[0, 1])
-
-        hook = t_hook(
-            out_dir=out_dir,
-            save_config=global_save_config,
-            save_all=True,
-            reduction_config=global_reduce_config,
-        )
+def test_save_shapes(out_dir):
+    global_reduce_config = ReductionConfig(save_shape=True)
+    global_save_config = SaveConfig(save_steps=[0, 1])
+
+    hook = t_hook(
+        out_dir=out_dir,
+        save_config=global_save_config,
+        save_all=True,
+        reduction_config=global_reduce_config,
+    )
     run_mnist_gluon_model(hook=hook, num_steps_train=5)
     verify_shapes(out_dir, 0)
     verify_shapes(out_dir, 1)
-    if hook_created:
-        shutil.rmtree(out_dir)
+    shutil.rmtree(out_dir)


 def test_save_config_hook_from_json():

From d16d1de0b523b5f778a891f31fb0d847292d8272 Mon Sep 17 00:00:00 2001
From: Rahul Huilgol
Date: Wed, 19 Aug 2020 16:07:27 -0700
Subject: [PATCH 28/40] Fix bug with old index files

---
 smdebug/core/index_reader.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/smdebug/core/index_reader.py b/smdebug/core/index_reader.py
index dfc8eb0d2..2c182d0d0 100644
--- a/smdebug/core/index_reader.py
+++ b/smdebug/core/index_reader.py
@@ -248,7 +248,7 @@ def _update_tensors_from_json(

         to_update_index_dict = []

-        if len(index_dict["tensor_payload"]):
+        if "tensor_payload" in index_dict and len(index_dict["tensor_payload"]):
             event_file_name = os.path.join(path, index_meta["event_file_name"])
             for tensor in index_dict["tensor_payload"]:
                 tensor_name = tensor["tensorname"]
@@ -259,7 +259,7 @@
                 )
                 to_update_index_dict.append((tensor_name, step, tensor_location))

-        if len(index_dict["shape_payload"]):
+        if "shape_payload" in index_dict and len(index_dict["shape_payload"]):
             for tensor in index_dict["shape_payload"]:
                 tensor_name = tensor["tensorname"]
                 original_name = tensor["originalname"]
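The guard above exists because index files written before this feature have no `shape_payload` key at all (and very old files may lack `tensor_payload`), so the reader must check for the key before taking its length. A minimal sketch of the defensive read follows; the payload field names come from the diffs above, while the concrete values and the surrounding file layout are illustrative assumptions:

```python
import json

# Hypothetical content of a newer index file; old files omit "shape_payload".
index_dict = json.loads(
    '{"tensor_payload": [], '
    '"shape_payload": [{"tensorname": "conv0_weight", '
    '"originalname": "conv0_weight", "shape": [32, 1, 3, 3]}]}'
)

# Mirrors the guard added in PATCH 28: key membership first, then length.
if "shape_payload" in index_dict and len(index_dict["shape_payload"]):
    for tensor in index_dict["shape_payload"]:
        print(tensor["tensorname"], tuple(tensor["shape"]))
```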
From 384b71c8a46069af79b46564c3288c58a62f7020 Mon Sep 17 00:00:00 2001
From: Rahul Huilgol
Date: Wed, 19 Aug 2020 18:17:08 -0700
Subject: [PATCH 29/40] Fix keras test with names of tensors

---
 tests/tensorflow2/test_keras.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py
index ffa5c9ae8..3df832404 100644
--- a/tests/tensorflow2/test_keras.py
+++ b/tests/tensorflow2/test_keras.py
@@ -474,7 +474,7 @@ def test_keras_fit_shapes(out_dir):
     )
     helper_keras_fit(trial_dir=out_dir, hook=hook)
     print(create_trial_fast_refresh(out_dir).tensor_names(step=0))
-    verify_shapes(out_dir, 0, ["dense/weights/dense/kernel:0", "accuracy", "Adam/beta_1:0"])
+    verify_shapes(out_dir, 0)


 @pytest.mark.slow

From cd8a4d15f79405f4df1655d3424ad31e3de646b0 Mon Sep 17 00:00:00 2001
From: Rahul Huilgol
Date: Wed, 19 Aug 2020 18:43:11 -0700
Subject: [PATCH 30/40] Set original name to None if tf_obj is None

---
 smdebug/core/hook.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/smdebug/core/hook.py b/smdebug/core/hook.py
index 345ee5317..758782620 100644
--- a/smdebug/core/hook.py
+++ b/smdebug/core/hook.py
@@ -760,7 +760,7 @@ def _write_shape(self, tensor_name, tensor_value, save_collections, tensor_ref=N
             if self.dry_run is False and reduction_config.save_shape is True:
                 numpy_tensor_value = self._make_numpy_array(tensor_value)
                 this_size, this_shape = size_and_shape(numpy_tensor_value)
-                if tensor_ref is not None:
+                if tensor_ref is not None and tensor_ref.tf_obj is not None:
                     original_name = tensor_ref.tf_obj.name
                 else:
                     original_name = None

From c4881b7541d62b276edebc97f9d9b8f0044ccfac Mon Sep 17 00:00:00 2001
From: Rahul Huilgol
Date: Wed, 19 Aug 2020 19:33:31 -0700
Subject: [PATCH 31/40] Fix mirrored test for cpu

---
 tests/tensorflow2/test_keras_mirrored.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/tensorflow2/test_keras_mirrored.py b/tests/tensorflow2/test_keras_mirrored.py
index 3d9ab247d..3f0bafe4f 100644
--- a/tests/tensorflow2/test_keras_mirrored.py
+++ b/tests/tensorflow2/test_keras_mirrored.py
@@ -293,7 +293,7 @@ def test_save_all(out_dir, tf_eager_mode, workers):

 @pytest.mark.slow
 def test_shapes(out_dir, tf_eager_mode):
-    train_model(
+    strategy, _ = train_model(
         out_dir,
         save_all=True,
         save_config=SaveConfig(save_steps=[0]),
@@ -301,7 +301,8 @@
         steps=["train"],
         eager=tf_eager_mode,
     )
-    verify_shapes(out_dir, 0, multiworker=True)
+    multiworker = strategy.num_replicas_in_sync > 1
+    verify_shapes(out_dir, 0, multiworker=multiworker)


 @pytest.mark.slow

From dd434c6f2b75bc75a13f66c6ed8ef5632858fd57 Mon Sep 17 00:00:00 2001
From: Rahul Huilgol
Date: Mon, 31 Aug 2020 17:06:09 -0700
Subject: [PATCH 32/40] Add docs

---
 docs/analysis.md | 40 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 35 insertions(+), 5 deletions(-)

diff --git a/docs/analysis.md b/docs/analysis.md
index e384ee4cc..d00a13c64 100644
--- a/docs/analysis.md
+++ b/docs/analysis.md
@@ -30,8 +30,10 @@ This page describes the programming model that SageMaker Debugger provides for y
     * [steps](#steps-1)
     * [value](#value)
     * [reduction_value](#reduction_value)
-    * [reduction_values](#reduction_values)
+    * [shape](#shape)
     * [values](#values)
+    * [reduction_values](#reduction_values)
+    * [shapes](#shapes)
     * [workers](#workers-1)
     * [prev_steps](#prev_steps)
 * [Rules](#Rules)
@@ -356,6 +358,34 @@ trial.tensor(name).reduction_value(step_num, reduction_name,

 ###### Returns
 `numpy.ndarray` The reduction value of tensor at the given step and worker (if the training job saved data from multiple workers) as a 1x1 numpy array. If this reduction was saved for the tensor during training as part of specification through reduction config, it will be loaded and returned. If the given reduction was not saved then, but the full tensor was saved, the reduction will be computed on the fly and returned. If both the chosen reduction and full tensor are not available, this method raises `TensorUnavailableForStep` exception.

+#### shape
+Get the shape of the chosen tensor at a particular step.
+
+```python
+trial.tensor(name).shape(step_num, mode=modes.GLOBAL, worker=None)
+```
+
+###### Arguments
+- `step_num (int)` The step number whose shape is to be returned for the mode passed through the next parameter.
+- `mode (smdebug.modes enum value)` The mode applicable for the step number passed above. Defaults to `modes.GLOBAL`
+- `worker (str)` This parameter is only applicable for distributed training. You can retrieve the shape of the tensor from a specific worker by passing the worker name. You can query all the workers seen by the trial with the `trial.workers()` method. You might also be interested in querying the workers which saved a value for the tensor at a specific step; this is possible with the method: `trial.tensor(name).workers(step, mode)`
+
+###### Returns
+`tuple(int)` If only the shape of this tensor was saved through the `save_shape` configuration in ReductionConfig, it will be returned. If the full tensor was saved, the shape will be computed from it and returned. If neither the shape nor the full tensor is available, this method raises the `TensorUnavailableForStep` exception.
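+
+For example, assuming a trial whose collections were saved with `save_shape=True` in their ReductionConfig, the shape of a tensor at step 0 could be queried as below (the path and tensor name here are only illustrative):
+
+```python
+from smdebug.trials import create_trial
+
+trial = create_trial("/tmp/my_trial")
+# returns a tuple such as (784, 10) for a dense layer's kernel
+print(trial.tensor("dense/weights/dense/kernel:0").shape(step_num=0))
+```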
+
+#### values
+Get the values of the tensor for all steps of a given mode.
+
+```python
+trial.tensor(name).values(mode=modes.GLOBAL, worker=None)
+```
+
+###### Arguments
+- `mode (smdebug.modes enum value)` The mode whose steps' values are to be returned. Defaults to `modes.GLOBAL`
+- `worker (str)` This parameter is only applicable for distributed training. You can retrieve the value of the tensor from a specific worker by passing the worker name. You can query all the workers seen by the trial with the `trial.workers()` method. You might also be interested in querying the workers which saved a value for the tensor at a specific step; this is possible with the method: `trial.tensor(name).workers(step, mode)`
+
+###### Returns
+`dict[int -> numpy.ndarray]` A dictionary with step numbers as keys and numpy arrays representing the value of the tensor as values.

 #### reduction_values
 Get all reduction values saved for the chosen tensor at a particular step. A reduction value is a tensor reduced to a single value through reduction or aggregation operations. Please go through the description of the method `reduction_value` for more details.

 ```python
 trial.tensor(name).reduction_values(step_num, mode=modes.GLOBAL, worker=None)
 ```

 ###### Arguments
@@ -372,11 +402,11 @@ trial.tensor(name).reduction_values(step_num, mode=modes.GLOBAL, worker=None)
 ###### Returns
 `dict[(str, bool) -> numpy.ndarray]` A dictionary with keys being tuples of the form `(reduction_name, abs)` to a 1x1 numpy ndarray value. `abs` here is a boolean that denotes whether the reduction was performed on the absolute value of the tensor or not. Note that this method only returns the reductions which were saved from the training job. It does not compute all known reductions and return them if only the raw tensor was saved.

-#### values
-Get the values of the tensor for all steps of a given mode.
+#### shapes
+Get the shapes of the tensor for all steps of a given mode.

 ```python
-trial.tensor(name).values(mode=modes.GLOBAL, worker=None)
+trial.tensor(name).shapes(mode=modes.GLOBAL, worker=None)
 ```

 ###### Arguments
@@ -384,7 +414,7 @@ trial.tensor(name).values(mode=modes.GLOBAL, worker=None)
 - `mode (smdebug.modes enum value)` The mode applicable for the step number passed above. Defaults to `modes.GLOBAL`
 - `worker (str)` This parameter is only applicable for distributed training. You can retrieve the value of the tensor from a specific worker by passing the worker name. You can query all the workers seen by the trial with the `trial.workers()` method. You might also be interested in querying the workers which saved a value for the tensor at a specific step, this is possible with the method: `trial.tensor(name).workers(step, mode)`

 ###### Returns
-`dict[int -> numpy.ndarray]` A dictionary with step numbers as keys and numpy arrays representing the value of the tensor as values.
+`dict[int -> tuple(int)]` A dictionary with step numbers as keys and tuples of ints representing the shapes of the tensor as values.
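+
+For example, assuming the same trial as above, the shapes of a tensor across all saved steps can be listed as follows (names are again illustrative):
+
+```python
+from smdebug.trials import create_trial
+
+trial = create_trial("/tmp/my_trial")
+# e.g. {0: (784, 10), 1: (784, 10)} if steps 0 and 1 were saved
+print(trial.tensor("dense/weights/dense/kernel:0").shapes())
+```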

 #### workers
 Get all the workers for which this tensor was saved at a given step

From 4fe8df0e7159a0c217cce19e0916c1ed3b129ac2 Mon Sep 17 00:00:00 2001
From: Rahul Huilgol
Date: Tue, 1 Sep 2020 11:47:31 -0700
Subject: [PATCH 33/40] trigger CI

From fa664d35d29905901fd3a05b7dae61ed6c295ae2 Mon Sep 17 00:00:00 2001
From: Rahul Huilgol
Date: Tue, 1 Sep 2020 15:25:23 -0700
Subject: [PATCH 34/40] Fix shape writer get

---
 smdebug/tensorflow/base_hook.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/smdebug/tensorflow/base_hook.py b/smdebug/tensorflow/base_hook.py
index c61fb3a17..de6175cbd 100644
--- a/smdebug/tensorflow/base_hook.py
+++ b/smdebug/tensorflow/base_hook.py
@@ -306,12 +306,12 @@ def _get_writers(self, tensor_name, tensor_ref, shape_writers=False) -> List[Fil
             else:
                 if shape_writers is False:
                     return [self.writer_map[self.device_map[self.chief_worker]]]
-                else:
+                elif self._saving_shapes_in_step():
                     return [self.shape_writer_map[self.device_map[self.chief_worker]]]
         elif self.save_all_workers or worker == self.chief_worker:
             if shape_writers is False:
                 return [self.writer_map[self.device_map[worker]]]
-            else:
+            elif self._saving_shapes_in_step():
                 return [self.shape_writer_map[self.device_map[worker]]]
         else:
             # training on CPU when all device strings have cpu
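The next patch removes the separate ShapeWriter by making FileWriter create its underlying event-file writer lazily, so a writer used only to record shapes never materializes an event file. A minimal sketch of that lazy-property pattern follows; the class and the use of plain `open` are simplified assumptions, while the real code below wraps an EventFileWriter and takes more parameters:

```python
class LazyWriter:
    """Sketch of the lazy event-file creation used by FileWriter below."""

    def __init__(self, path):
        self.path = path
        self._proto_writer = None  # no file has been opened yet

    @property
    def proto_writer(self):
        # The event file is created only on first use, so a writer that
        # serves only shape records never creates an empty event file.
        if self._proto_writer is None:
            self._proto_writer = open(self.path, "ab")
        return self._proto_writer

    def close(self):
        # Close only what was actually created.
        if self._proto_writer is not None:
            self._proto_writer.close()
```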
From b5b29b1ced3cea5571b33da19ec054f3107a3c3f Mon Sep 17 00:00:00 2001
From: Rahul Huilgol
Date: Wed, 2 Sep 2020 16:03:29 -0700
Subject: [PATCH 35/40] Simplify by removing shape writer

---
 smdebug/core/hook.py            |  37 ++-----
 smdebug/core/writer.py          | 173 +++++++++++++++-----------------
 smdebug/tensorflow/base_hook.py |  59 ++---------
 3 files changed, 96 insertions(+), 173 deletions(-)

diff --git a/smdebug/core/hook.py b/smdebug/core/hook.py
index 5a123e625..aba7dae83 100644
--- a/smdebug/core/hook.py
+++ b/smdebug/core/hook.py
@@ -47,7 +47,7 @@
     size_and_shape,
     validate_custom_tensor_value,
 )
-from smdebug.core.writer import FileWriter, ShapeWriter
+from smdebug.core.writer import FileWriter
 from smdebug.exceptions import InvalidCollectionConfiguration

 try:
@@ -223,7 +223,7 @@ def __init__(
         self.mode = ModeKeys.GLOBAL
         self.mode_steps = {ModeKeys.GLOBAL: init_step}
         self.writer = None
-        self.shape_writer = None
+
         if is_sagemaker_job() and SageMakerFileMetricsWriter is not None:
             self.metrics_writer = SageMakerFileMetricsWriter()
         else:
@@ -344,12 +344,6 @@ def _get_collections_to_save_for_step(self) -> Set["Collection"]:
             )
         return self._collections_to_save_for_step

-    def _saving_shapes_in_step(self) -> bool:
-        for coll in self._get_collections_to_save_for_step():
-            if coll.reduction_config.save_shape is True:
-                return True
-        return False
-
     def is_tensor_saved_for_step(self, tensor_name):
         collections_to_save = self._get_collections_to_save_for_step()
         for c in collections_to_save:
@@ -452,10 +446,6 @@ def _close_writers(self) -> None:

         self._close_given_writer_map(self.tb_writers)

-        if self.shape_writer is not None:
-            self.shape_writer.close()
-            self.shape_writer = None
-
     def _initialize_writers(self, only_initialize_if_missing=False) -> None:
         # Function is overridden in smdebug/tensorflow/base_hook.py
         if only_initialize_if_missing and self.writer:
             return
         self.writer = FileWriter(trial_dir=self.out_dir, step=self.step, worker=self.worker)
-        if self._saving_shapes_in_step():
-            self.shape_writer = ShapeWriter(
-                trial_dir=self.out_dir,
-                step=self.step,
-                worker=self.worker,
-                index_writer=self.writer.index_writer,
-            )
-
-    def _get_single_process_writers(self, shape_writers=False) -> List[FileWriter]:
-        if shape_writers is False:
-            return [self.writer] if self.writer else []
-        else:
-            return [self.shape_writer] if self.shape_writer else []
+    def _get_single_process_writers(self) -> List[FileWriter]:
+        return [self.writer] if self.writer else []

-    def _get_writers(self, tensor_name, tensor_ref=None, shape_writers=False) -> List[FileWriter]:
+    def _get_writers(self, tensor_name, tensor_ref=None) -> List[FileWriter]:
         """
         :param tensor_name:
         :param tensor_ref: used by TF
         :return: List[FileWriter]
         """
         if self.save_all_workers is False and self.worker != self.chief_worker:
             return []
-        return self._get_single_process_writers(shape_writers)
+        return self._get_single_process_writers()

     def _maybe_get_tb_writer(self) -> Optional[FileWriter]:
         """ Returns a FileWriter object if `hook.tensorboard_dir` has been specified, else None.
@@ -777,7 +756,7 @@ def _write_raw_tensor(self, tensor_name, tensor_value, save_collections, tensor_
             break

     def _write_shape(self, tensor_name, tensor_value, save_collections, tensor_ref=None):
-        shape_writers = self._get_writers(tensor_name, tensor_ref=tensor_ref, shape_writers=True)
+        writers = self._get_writers(tensor_name, tensor_ref=tensor_ref)
         for s_col in save_collections:
             reduction_config = s_col.reduction_config
             if self.dry_run is False and reduction_config.save_shape is True:
                 numpy_tensor_value = self._make_numpy_array(tensor_value)
                 this_size, this_shape = size_and_shape(numpy_tensor_value)
@@ -788,7 +767,7 @@ def _write_shape(self, tensor_name, tensor_value, save_collections, tensor_ref=N
                 else:
                     original_name = None

-                for writer in shape_writers:
+                for writer in writers:
                     writer.write_shape(
                         tensor_name,
                         this_shape,

diff --git a/smdebug/core/writer.py b/smdebug/core/writer.py
index 3966da82b..eeb50ee3a 100644
--- a/smdebug/core/writer.py
+++ b/smdebug/core/writer.py
@@ -46,52 +46,7 @@
 logger = get_logger()


-class BaseWriter:
-    def __init__(self, trial_dir, worker, step=0, mode=ModeKeys.GLOBAL):
-        self.trial_dir = trial_dir
-        self.step = step
-        self.worker = worker
-        if worker is None:
-            assert False, "Worker should not be none. Check worker name initialization"
-        self.mode = mode
-        self._writer = None
-        self._index_writer = None
-
-    def name(self):
-        return self._writer.name()
-
-    def __enter__(self):
-        """Make usable with "with" statement."""
-        return self
-
-    def __exit__(self, unused_type, unused_value, unused_traceback):
-        """Make usable with "with" statement."""
-        self.close()
-
-    def flush(self):
-        """Flushes the event file to disk.
-        Call this method to make sure that all pending events have been written to disk.
- """ - self._writer.flush() - # don't flush index writer as we only want to flush on close - - @classmethod - def create_index_writer(cls, trial_dir, worker, step): - el = TensorFileLocation(step_num=step, worker_name=worker) - event_file_path = el.get_file_location(trial_dir=trial_dir) - index_file_path = IndexFileLocationUtils.get_index_key_for_step(trial_dir, step, worker) - return IndexWriter(index_file_path) - - @property - def index_writer(self): - return self._index_writer - - @index_writer.setter - def index_writer(self, iw): - self._index_writer = iw - - -class FileWriter(BaseWriter): +class FileWriter: def __init__( self, trial_dir, @@ -103,7 +58,6 @@ def __init__( flush_secs=120, verbose=False, write_checksum=False, - index_writer=None, ): """Creates a `FileWriter` and an file. On construction the summary writer creates a new event file in `trial_dir`. @@ -125,35 +79,44 @@ def __init__( verbose : bool Determines whether to print logging messages. """ - super(FileWriter, self).__init__(trial_dir, worker, step, mode) + self.trial_dir = trial_dir + self.worker = worker + self.step = step + self.wtype = wtype + self.mode = mode + self.max_queue = max_queue + self.flush_secs = flush_secs + self.verbose = verbose + self.write_checksum = write_checksum + + self._proto_writer = None + if wtype == "events": - if index_writer is None: - self.index_writer = self.create_index_writer( - trial_dir=trial_dir, worker=worker, step=step - ) - else: - self.index_writer = index_writer el = TensorFileLocation(step_num=self.step, worker_name=self.worker) - event_file_path = el.get_file_location(trial_dir=self.trial_dir) + self.event_file_path = el.get_file_location(trial_dir=self.trial_dir) + index_file_path = IndexFileLocationUtils.get_index_key_for_step( + self.trial_dir, self.step, self.worker + ) + self.index_writer = IndexWriter(index_file_path) elif wtype == "tensorboard": el = TensorboardFileLocation( step_num=self.step, worker_name=self.worker, mode=self.mode ) - event_file_path = el.get_file_location(base_dir=self.trial_dir) + self.event_file_path = el.get_file_location(base_dir=self.trial_dir) self.index_writer = None else: assert False, "Writer type not supported: {}".format(wtype) - self._writer = EventFileWriter( - path=event_file_path, - index_writer=self.index_writer, - max_queue=max_queue, - flush_secs=flush_secs, - verbose=verbose, - write_checksum=write_checksum, - ) self._default_bins = _get_default_bins() + def __enter__(self): + """Make usable with "with" statement.""" + return self + + def __exit__(self, unused_type, unused_value, unused_traceback): + """Make usable with "with" statement.""" + self.close() + @staticmethod def _get_metadata(mode, mode_step): sm2 = SummaryMetadata.PluginData(plugin_name=MODE_STEP_PLUGIN_NAME, content=str(mode_step)) @@ -162,6 +125,27 @@ def _get_metadata(mode, mode_step): smd = SummaryMetadata(plugin_data=plugin_data) return smd + @property + def index_writer(self): + return self._index_writer + + @index_writer.setter + def index_writer(self, iw): + self._index_writer = iw + + @property + def proto_writer(self): + if self._proto_writer is None: + self._proto_writer = EventFileWriter( + path=self.event_file_path, + index_writer=self.index_writer, + max_queue=self.max_queue, + flush_secs=self.flush_secs, + verbose=self.verbose, + write_checksum=self.write_checksum, + ) + return self._proto_writer + def write_tensor( self, tdata, tname, write_index=True, mode=ModeKeys.GLOBAL, mode_step=None, timestamp=None ): @@ -172,14 +156,14 @@ def write_tensor( 
         tensor_proto = make_tensor_proto(nparray_data=value, tag=tag)
         s = Summary(value=[Summary.Value(tag=tag, metadata=smd, tensor=tensor_proto)])
         if write_index:
-            self._writer.write_summary_with_index(
+            self.proto_writer.write_summary_with_index(
                 s, self.step, tname, mode, mode_step, timestamp=timestamp
             )
         else:
-            self._writer.write_summary(s, self.step, timestamp)
+            self.proto_writer.write_summary(s, self.step, timestamp)

     def write_graph(self, graph):
-        self._writer.write_graph(graph)
+        self.proto_writer.write_graph(graph)

     def write_pytorch_graph(self, graph_profile):
         # https://github.com/pytorch/pytorch/blob/c749be9e9f8dd3db8b3582e93f917bd47e8e9e20/torch/utils/tensorboard/writer.py
         # L99
         graph = graph_profile[0]
         stepstats = graph_profile[1]
         event = Event(graph_def=graph.SerializeToString())
-        self._writer.write_event(event)
+        self.proto_writer.write_event(event)
         trm = TaggedRunMetadata(tag="step1", run_metadata=stepstats.SerializeToString())
         event = Event(tagged_run_metadata=trm)
-        self._writer.write_event(event)
+        self.proto_writer.write_event(event)

     def write_summary(self, summ, global_step, timestamp: float = None):
-        self._writer.write_summary(summ, global_step, timestamp=timestamp)
+        self.proto_writer.write_summary(summ, global_step, timestamp=timestamp)

     def write_histogram_summary(self, tdata, tname, global_step, bins="default"):
         """Add histogram data to the event file.
@@ -222,22 +206,44 @@ def write_histogram_summary(self, tdata, tname, global_step, bins="default"):
             bins = self._default_bins
         try:
             s = histogram_summary(tname, tdata, bins)
-            self._writer.write_summary(s, global_step)
+            self.proto_writer.write_summary(s, global_step)
         except ValueError as e:
             logger.warning(f"Unable to write histogram {tname} at {global_step}: {e}")

     def write_scalar_summary(self, name, value, global_step, timestamp: float = None):
         s = scalar_summary(name, value)
-        self._writer.write_summary(s, global_step, timestamp=timestamp)
+        self.proto_writer.write_summary(s, global_step, timestamp=timestamp)
+
+    def write_shape(
+        self, name, shape: Tuple[int], mode=ModeKeys.GLOBAL, mode_step=None, original_name=None
+    ):
+        self.index_writer.add_shape(
+            TensorShape(name, mode.name, mode_step, shape, original_name=original_name)
+        )
+
     def close(self):
         """Flushes the event file to disk and close the file.
         Call this method when you do not need the summary writer anymore.
         """
-        self._writer.close()
+        if self._proto_writer is not None:
+            self.proto_writer.close()
         if self.index_writer is not None:
             self.index_writer.close()

+    def flush(self):
+        """Flushes the event file to disk.
+        Call this method to make sure that all pending events have been written to disk.
+ """ + if self._proto_writer is not None: + self._proto_writer.flush() + # don't flush index writer as we only want to flush on close + + def name(self): + if self._proto_writer: + return self._proto_writer.name() + else: + return None + @staticmethod def _check_mode_step(mode, mode_step, global_step): if mode_step is None: @@ -249,24 +255,3 @@ def _check_mode_step(mode, mode_step, global_step): ex_str = "mode can be one of " + ", ".join(mode_keys) raise ValueError(ex_str) return mode, mode_step - - -class ShapeWriter(BaseWriter): - def __init__(self, trial_dir, worker, index_writer, step=0, mode=ModeKeys.GLOBAL): - super(ShapeWriter, self).__init__(trial_dir, worker, step, mode) - self._index_writer = index_writer - - def write_shape( - self, name, shape: Tuple[int], mode=ModeKeys.GLOBAL, mode_step=None, original_name=None - ): - self._index_writer.add_shape( - TensorShape(name, mode.name, mode_step, shape, original_name=original_name) - ) - - def flush(self): - self._index_writer.flush() - - def close(self): - """Flushes the event file to disk and close the file. - """ - self._index_writer.close() diff --git a/smdebug/tensorflow/base_hook.py b/smdebug/tensorflow/base_hook.py index de6175cbd..9e5cd646d 100644 --- a/smdebug/tensorflow/base_hook.py +++ b/smdebug/tensorflow/base_hook.py @@ -16,7 +16,7 @@ from smdebug.core.reductions import get_numpy_reduction, get_reduction_tensor_name from smdebug.core.tfevent.util import make_numpy_array from smdebug.core.utils import serialize_tf_device -from smdebug.core.writer import FileWriter, ShapeWriter +from smdebug.core.writer import FileWriter # Local from .collection import CollectionKeys, CollectionManager @@ -87,7 +87,7 @@ def __init__( Example -> /job:worker/replica:0/task:1/device:GPU:0 : _job-worker_replica-0_task-1_device-GPU-0""" self.device_map = {} self.writer_map = {} - self.shape_writer_map = {} + # This will be None if the var wasn't set, i.e. not param server self.tf_config_json = load_tf_config_json(os.getenv("TF_CONFIG")) self._hook_supported = None @@ -271,7 +271,7 @@ def _set_chief_worker(self): elif self.distribution_strategy == TFDistributionStrategy.UNSUPPORTED: raise NotImplementedError - def _get_writers(self, tensor_name, tensor_ref, shape_writers=False) -> List[FileWriter]: + def _get_writers(self, tensor_name, tensor_ref) -> List[FileWriter]: """ For tensors generated during distributed tf jobs, we map the tensor to a writer with its device attribute. 
@@ -288,7 +288,7 @@ def _get_writers(self, tensor_name, tensor_ref, shape_writers=False) -> List[Fil
             TFDistributionStrategy.HOROVOD,
         ]:
             if self.save_all_workers is True or self.worker == self.chief_worker:
-                return self._get_single_process_writers(shape_writers)
+                return self._get_single_process_writers()
         elif self.distribution_strategy == TFDistributionStrategy.MIRRORED:
             if len(self.device_map):
                 # else is for metrics in Keras
@@ -299,25 +299,16 @@ def _get_writers(self, tensor_name, tensor_ref, shape_writers=False) -> List[Fil
                 # if device str is empty or cpu in worker
                 if not bool(worker) or "CPU" in worker:
                     if self.save_all_workers:
-                        if shape_writers is False:
-                            return list(self.writer_map.values())
-                        else:
-                            return list(self.shape_writer_map.values())
+                        return list(self.writer_map.values())
                     else:
-                        if shape_writers is False:
-                            return [self.writer_map[self.device_map[self.chief_worker]]]
-                        elif self._saving_shapes_in_step():
-                            return [self.shape_writer_map[self.device_map[self.chief_worker]]]
+                        return [self.writer_map[self.device_map[self.chief_worker]]]
                 elif self.save_all_workers or worker == self.chief_worker:
-                    if shape_writers is False:
-                        return [self.writer_map[self.device_map[worker]]]
-                    elif self._saving_shapes_in_step():
-                        return [self.shape_writer_map[self.device_map[worker]]]
+                    return [self.writer_map[self.device_map[worker]]]
             else:
                 # training on CPU when all device strings have cpu
-                return self._get_single_process_writers(shape_writers)
+                return self._get_single_process_writers()
         elif self.distribution_strategy == TFDistributionStrategy.NONE:
-            return self._get_single_process_writers(shape_writers)
+            return self._get_single_process_writers()
         else:
             raise NotImplementedError
         # when self.writer is None, returns empty list
@@ -338,13 +329,6 @@ def _initialize_writers(self, only_initialize_if_missing=False) -> None:
                     self.writer = FileWriter(
                         trial_dir=self.out_dir, step=self.step, worker=self.worker
                     )
-                if self._saving_shapes_in_step():
-                    self.shape_writer = ShapeWriter(
-                        trial_dir=self.out_dir,
-                        step=self.step,
-                        worker=self.worker,
-                        index_writer=self.writer.index_writer,
-                    )
         elif self.distribution_strategy == TFDistributionStrategy.MIRRORED:
             if len(self.device_map):
                 for device, device_string in self.device_map.items():
@@ -354,37 +338,16 @@ def _initialize_writers(self, only_initialize_if_missing=False) -> None:
                         self.writer_map[device_string] = FileWriter(
                             trial_dir=self.out_dir, step=self.step, worker=device_string
                         )
-                        if self._saving_shapes_in_step():
-                            self.shape_writer_map[device_string] = ShapeWriter(
-                                trial_dir=self.out_dir,
-                                step=self.step,
-                                worker=self.worker,
-                                index_writer=self.writer_map[device_string].index_writer,
-                            )
             else:
                 # training on CPU when all device strings have cpu
                 if self.writer is None or only_initialize_if_missing is False:
                     self.writer = FileWriter(
                         trial_dir=self.out_dir, step=self.step, worker=self.worker
                     )
-                    if self._saving_shapes_in_step():
-                        self.shape_writer = ShapeWriter(
-                            trial_dir=self.out_dir,
-                            step=self.step,
-                            worker=self.worker,
-                            index_writer=self.writer.index_writer,
-                        )
         elif self.distribution_strategy == TFDistributionStrategy.NONE:
             if self.writer is None or only_initialize_if_missing is False:
                 self.writer = FileWriter(trial_dir=self.out_dir, step=self.step, worker=self.worker)
-                if self._saving_shapes_in_step():
-                    self.shape_writer = ShapeWriter(
-                        trial_dir=self.out_dir,
-                        step=self.step,
-                        worker=self.worker,
-                        index_writer=self.writer.index_writer,
-                    )
         else:
             raise NotImplementedError

@@ -404,10 +367,6 @@ def _close_writers(self) -> None:
         self._close_given_writer_map(self.shape_writer_map)
         self._close_given_writer_map(self.tb_writers)

-        if self.shape_writer is not None:
-            self.shape_writer.close()
-            self.shape_writer = None
-
     def _export_model(self):
         tb_writer = self._maybe_get_tb_writer()
         if tb_writer:

From 131ec443b1f295b466dbeda020f43d818e0d6753 Mon Sep 17 00:00:00 2001
From: Rahul Huilgol
Date: Wed, 2 Sep 2020 16:12:24 -0700
Subject: [PATCH 36/40] Cleanup

---
 smdebug/tensorflow/base_hook.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/smdebug/tensorflow/base_hook.py b/smdebug/tensorflow/base_hook.py
index 9e5cd646d..ce2aac78b 100644
--- a/smdebug/tensorflow/base_hook.py
+++ b/smdebug/tensorflow/base_hook.py
@@ -364,7 +364,6 @@ def _close_writers(self) -> None:
             self.writer = None

         self._close_given_writer_map(self.writer_map)
-        self._close_given_writer_map(self.shape_writer_map)
         self._close_given_writer_map(self.tb_writers)

     def _export_model(self):

From dee71062f4b48de145de896fc5fbd124af892cd0 Mon Sep 17 00:00:00 2001
From: Rahul Huilgol
Date: Wed, 2 Sep 2020 16:29:41 -0700
Subject: [PATCH 37/40] Fix name of writer

---
 smdebug/core/writer.py   | 4 +++-
 tests/core/test_numpy.py | 8 ++------
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/smdebug/core/writer.py b/smdebug/core/writer.py
index eeb50ee3a..29ed7daaa 100644
--- a/smdebug/core/writer.py
+++ b/smdebug/core/writer.py
@@ -242,7 +242,9 @@ def name(self):
         if self._proto_writer:
             return self._proto_writer.name()
         else:
-            return None
+            raise RuntimeError(
+                "Writer hasn't been initialized yet. It will be initialized when the first tensor or summary is written."
+            )

     @staticmethod
     def _check_mode_step(mode, mode_step, global_step):

diff --git a/tests/core/test_numpy.py b/tests/core/test_numpy.py
index 25f7877ce..abe54a3fe 100644
--- a/tests/core/test_numpy.py
+++ b/tests/core/test_numpy.py
@@ -15,11 +15,10 @@ def rw(path):
     Checks that we can save data and read it back the way it was
     """
     with FileWriter(trial_dir=path + "/my_trial", step=20, worker="algo-1") as fw:
-        fname = fw.name()
-        print(f"Saving data in {fname}")
         for i in range(10):
             data = np.ones(shape=(4, 4), dtype=np.float32) * i
             fw.write_tensor(tdata=data, tname=f"foo_{i}")
+        fname = fw.name()

     fr = FileReader(fname=fname)
     for i, ts in enumerate(fr.read_tensors()):
@@ -47,17 +46,14 @@ def test_s3():
     key_name = f"outputs/core-tests-{uuid.uuid4()}"
     # sagemaker-us-east-1-722321484884
     location = "s3://{}/{}".format(bucket_name, key_name)
-    print("Saving to Location")
     rw(location)


 def test_string():
     with FileWriter(trial_dir="/tmp/ts_output/my_trial", step=20, worker="algo-1") as fw:
-        fname = fw.name()
-        print(f"Saving string data in {fname}")
         s_written = np.array(["foo", "barz"])
         fw.write_tensor(tdata=s_written, tname=f"foo_string")
-
+        fname = fw.name()
     fr = FileReader(fname=fname)
     read = list(fr.read_tensors())
     assert len(read) == 1

From 5c89fa4f8dea45e6bfd9ebc7f7941749ae7a06f1 Mon Sep 17 00:00:00 2001
From: Rahul Huilgol
Date: Tue, 8 Sep 2020 11:19:16 -0700
Subject: [PATCH 38/40] Addressed review comments

---
 smdebug/core/hook.py            | 7 +++++--
 smdebug/core/locations.py       | 4 +---
 smdebug/core/tensor.py          | 5 +++--
 smdebug/tensorflow/base_hook.py | 6 +++---
 4 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/smdebug/core/hook.py b/smdebug/core/hook.py
index aba7dae83..cb53a947e 100644
--- a/smdebug/core/hook.py
+++ b/smdebug/core/hook.py
@@ -475,7 +475,7 @@ def _initialize_writers(self, only_initialize_if_missing=False) -> None:
         self.writer = FileWriter(trial_dir=self.out_dir, step=self.step, worker=self.worker)

-    def _get_single_process_writers(self) -> List[FileWriter]:
+    def _get_main_writer(self) -> List[FileWriter]:
         return [self.writer] if self.writer else []

     def _get_writers(self, tensor_name, tensor_ref=None) -> List[FileWriter]:
@@ -486,7 +486,7 @@ def _get_writers(self, tensor_name, tensor_ref=None) -> List[FileWriter]:
         """
         if self.save_all_workers is False and self.worker != self.chief_worker:
             return []
-        return self._get_single_process_writers()
+        return self._get_main_writer()

     def _maybe_get_tb_writer(self) -> Optional[FileWriter]:
         """ Returns a FileWriter object if `hook.tensorboard_dir` has been specified, else None.
@@ -762,6 +762,9 @@ def _write_shape(self, tensor_name, tensor_value, save_collections, tensor_ref=N
             if self.dry_run is False and reduction_config.save_shape is True:
                 numpy_tensor_value = self._make_numpy_array(tensor_value)
                 this_size, this_shape = size_and_shape(numpy_tensor_value)
+                # In TF Keras, and for TF Variables in all TF interfaces, we sometimes output
+                # tensors with more meaningful names than the original name. In such cases we
+                # output both the SMDebug-given name and the original name.
                 if tensor_ref is not None and tensor_ref.tf_obj is not None:
                     original_name = tensor_ref.tf_obj.name
                 else:
                     original_name = None

diff --git a/smdebug/core/locations.py b/smdebug/core/locations.py
index 9712f58a2..c50cdda20 100644
--- a/smdebug/core/locations.py
+++ b/smdebug/core/locations.py
@@ -26,10 +26,8 @@ def to_dict(self):

 class TensorShape:
     def __init__(self, name, mode, mode_step, shape, original_name=None):
-        if original_name is None:
-            original_name = name
         self.name = name
-        self.original_name = original_name
+        self.original_name = original_name if original_name is not None else name
         self.mode = mode
         self.mode_step = mode_step
         self.shape = tuple(shape)

diff --git a/smdebug/core/tensor.py b/smdebug/core/tensor.py
index de52f7858..c48375fdc 100644
--- a/smdebug/core/tensor.py
+++ b/smdebug/core/tensor.py
@@ -138,8 +138,9 @@ class Tensor:
     def __init__(self, name, trial, cache):
         self._mode_steps = {}
         self.name = name
-        # SMdebug modifies some names of tensors to be more descriptive
-        # In such cases we save here the original name
+        # In TF Keras, and for TF Variables in all TF interfaces,
+        # SMDebug modifies some names of tensors to be more descriptive.
+        # In such cases we save here the original name.
         self.original_name = None
         self.trial = trial
         self.cache = cache

diff --git a/smdebug/tensorflow/base_hook.py b/smdebug/tensorflow/base_hook.py
index ce2aac78b..9a8d46e4e 100644
--- a/smdebug/tensorflow/base_hook.py
+++ b/smdebug/tensorflow/base_hook.py
@@ -288,7 +288,7 @@ def _get_writers(self, tensor_name, tensor_ref) -> List[FileWriter]:
             TFDistributionStrategy.HOROVOD,
         ]:
             if self.save_all_workers is True or self.worker == self.chief_worker:
-                return self._get_single_process_writers()
+                return self._get_main_writer()
         elif self.distribution_strategy == TFDistributionStrategy.MIRRORED:
             if len(self.device_map):
                 # else is for metrics in Keras
@@ -306,9 +306,9 @@ def _get_writers(self, tensor_name, tensor_ref) -> List[FileWriter]:
                 return [self.writer_map[self.device_map[worker]]]
             else:
                 # training on CPU when all device strings have cpu
-                return self._get_single_process_writers()
+                return self._get_main_writer()
         elif self.distribution_strategy == TFDistributionStrategy.NONE:
-            return self._get_single_process_writers()
+            return self._get_main_writer()
         else:
             raise NotImplementedError
         # when self.writer is None, returns empty list

From a893d91f7e3ca882a1873f58372614b76a510788 Mon Sep 17 00:00:00 2001
From: Rahul Huilgol
Date: Tue, 8 Sep 2020 15:18:11 -0700
Subject: [PATCH 39/40] trigger ci

From 1f94933d54a5a3909e18a892c72432bf8feab555 Mon Sep 17 00:00:00 2001
From: NihalHarish
Date: Tue, 8 Sep 2020 15:57:06 -0700
Subject: [PATCH 40/40] retrigger CI
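Taken together, the series lets a user save only tensor shapes during training and read them back afterwards. A minimal end-to-end sketch with the MXNet hook follows, mirroring the `test_save_shapes` test above; the output directory is illustrative, and the import paths assume the usual re-exports from the framework package:

```python
from smdebug.mxnet import Hook, ReductionConfig, SaveConfig
from smdebug.trials import create_trial

out_dir = "/tmp/shape_demo"

# Save only shapes (no raw values) for all tensors at steps 0 and 1.
hook = Hook(
    out_dir=out_dir,
    save_config=SaveConfig(save_steps=[0, 1]),
    save_all=True,
    reduction_config=ReductionConfig(save_shape=True),
)
# ... register the hook on the model and run a few training steps ...

# Read the shapes back through the trial API documented in PATCH 32.
trial = create_trial(out_dir)
for tname in trial.tensor_names(step=0):
    print(tname, trial.tensor(tname).shape(0))
```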