From 3e708013e87f705ec2d9d68fef6f292609672345 Mon Sep 17 00:00:00 2001 From: beda Date: Mon, 28 Jan 2019 17:47:55 +0100 Subject: [PATCH 01/10] Add profiling of neural networks --- emloop_tensorflow/frozen_model.py | 26 +++++++++++++++++++++++--- emloop_tensorflow/model.py | 22 +++++++++++++++++++--- 2 files changed, 42 insertions(+), 6 deletions(-) diff --git a/emloop_tensorflow/frozen_model.py b/emloop_tensorflow/frozen_model.py index f2dbbc4..7038127 100644 --- a/emloop_tensorflow/frozen_model.py +++ b/emloop_tensorflow/frozen_model.py @@ -5,6 +5,7 @@ import emloop as el import tensorflow as tf +from tensorflow.python.client import timeline from .graph_tower import GraphTower from .model import BaseModel @@ -29,16 +30,18 @@ class FrozenModel(el.AbstractModel): """ def __init__(self, - inputs: List[str], outputs: List[str], restore_from: str, - session_config: Optional[dict]=None, n_gpus: int=0, **_): + log_dir: str, inputs: List[str], outputs: List[str], restore_from: str, + session_config: Optional[dict]=None, n_gpus: int=0, profile: bool=False, **_): """ Initialize new :py:class:`FrozenModel` instance. 
+ :param log_dir: path to the logging directory (wherein models should be saved) :param inputs: model input names :param outputs: model output names :param restore_from: restore model path (either a dir or a .pb file) :param session_config: TF session configuration dict :param n_gpus: number of GPUs to use (either 0 or 1) + :param profile: whether profile.json should be saved to log_dir """ super().__init__(None, '', restore_from) assert 0 <= n_gpus <= 1, 'FrozenModel can be used only with n_gpus=0 or n_gpus=1' @@ -50,6 +53,7 @@ def __init__(self, self._graph = tf.Graph() if session_config: session_config = tf.ConfigProto(**session_config) + self._session = tf.Session(graph=self._graph, config=session_config) with self._graph.as_default(): @@ -60,6 +64,10 @@ def __init__(self, except KeyError: self._is_training = tf.placeholder(tf.bool, [], BaseModel.TRAINING_FLAG_NAME) + self._profile = profile + self._log_dir = log_dir + + def run(self, batch: el.Batch, train: bool=False, stream: el.datasets.StreamWrapper=None) -> Mapping[str, object]: """ Run the model with the given ``batch``. 
@@ -83,7 +91,19 @@ def run(self, batch: el.Batch, train: bool=False, stream: el.datasets.StreamWrap for output_name in self.output_names: fetches.append(self._tower[output_name]) - outputs = self._session.run(fetches=fetches, feed_dict=feed_dict) + if self._profile: + run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) + run_metadata = tf.RunMetadata() + + outputs = self._session.run(fetches=fetches, feed_dict=feed_dict, + options=run_options, run_metadata=run_metadata) + + with open(path.join(self._log_dir, "profile.json"), "w") as ofile: + tl = timeline.Timeline(run_metadata.step_stats) + ofile.write(tl.generate_chrome_trace_format()) + + else: + outputs = self._session.run(fetches=fetches, feed_dict=feed_dict) return dict(zip(self.output_names, outputs)) diff --git a/emloop_tensorflow/model.py b/emloop_tensorflow/model.py index 9e1405e..b896d17 100644 --- a/emloop_tensorflow/model.py +++ b/emloop_tensorflow/model.py @@ -8,6 +8,7 @@ import numpy as np import emloop as el import tensorflow as tf +from tensorflow.python.client import timeline from .third_party.tensorflow.freeze_graph import freeze_graph from .third_party.tensorflow.average_gradients import average_gradients @@ -44,7 +45,7 @@ def __init__(self, # pylint: disable=too-many-arguments dataset: Optional[el.AbstractDataset], log_dir: Optional[str], inputs: List[str], outputs: List[str], session_config: Optional[dict]=None, n_gpus: int=0, restore_from: Optional[str]=None, optimizer=None, freeze=False, loss_name: str=DEFAULT_LOSS_NAME, monitor: Optional[str]=None, - restore_fallback: Optional[str]=None, clip_gradient: Optional[float]=None, + restore_fallback: Optional[str]=None, clip_gradient: Optional[float]=None, profile: bool=False, **kwargs): """ Create new emloop trainable TensorFlow model. 
@@ -82,6 +83,7 @@ def __init__(self, # pylint: disable=too-many-arguments :param monitor: monitor signal mean and variance of the tensors which names contain the specified value :param restore_fallback: ignored arg. (allows training from configs saved by emloop where it is added) :param clip_gradient: limit the absolute value of the gradient; set to None for no clipping + :param profile: whether profile.json should be saved to log_dir :param kwargs: additional kwargs forwarded to :py:meth:`_create_model` """ super().__init__(dataset=dataset, log_dir=log_dir, restore_from=restore_from) @@ -91,6 +93,7 @@ def __init__(self, # pylint: disable=too-many-arguments self._log_dir = log_dir self._freeze_graph = freeze self._clip_gradient = clip_gradient + self._profile = profile self._loss_name = loss_name self._train_ops = [] self._graph = self._saver = None @@ -223,12 +226,25 @@ def run(self, batch: el.Batch, train: bool=False, stream: el.datasets.StreamWrap for output_name in self.output_names: fetches.append(tower[output_name]) + run_options = None + run_metadata = None + if self._profile: + run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) + run_metadata = tf.RunMetadata() + # run the computational graph for one batch and allow buffering in the meanwhile if stream is not None: with stream.allow_buffering: - outputs = self._session.run(fetches=fetches, feed_dict=feed_dict) + outputs = self._session.run(fetches=fetches, feed_dict=feed_dict, + options=run_options, run_metadata=run_metadata) else: - outputs = self._session.run(fetches=fetches, feed_dict=feed_dict) + outputs = self._session.run(fetches=fetches, feed_dict=feed_dict, + options=run_options, run_metadata=run_metadata) + + if self._profile: + with open(path.join(self._log_dir, "profile.json"), "w") as ofile: + tl = timeline.Timeline(run_metadata.step_stats) + ofile.write(tl.generate_chrome_trace_format()) if train: outputs = outputs[1:] From 778c47c0c4ac193caa2b31c7d5c5f34d27a0e963 Mon Sep 17 
00:00:00 2001 From: beda Date: Mon, 28 Jan 2019 19:03:40 +0100 Subject: [PATCH 02/10] Tests --- emloop_tensorflow/tests/frozen_model_test.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/emloop_tensorflow/tests/frozen_model_test.py b/emloop_tensorflow/tests/frozen_model_test.py index 190938a..2631a0d 100644 --- a/emloop_tensorflow/tests/frozen_model_test.py +++ b/emloop_tensorflow/tests/frozen_model_test.py @@ -18,24 +18,24 @@ def test_frozen_model_restore(tmpdir): """Test frozen model restoration.""" with pytest.raises(ValueError): - FrozenModel(inputs=[], outputs=[], restore_from=tmpdir) # there is no .pb file yet + FrozenModel(log_dir="/dev/null", inputs=[], outputs=[], restore_from=tmpdir) # there is no .pb file yet dummy_model = TrainableModel(dataset=None, log_dir=tmpdir, **_IO, freeze=True, optimizer=_OPTIMIZER) dummy_model.save('') # restore from directory - FrozenModel(**_IO, restore_from=tmpdir) + FrozenModel(log_dir="/dev/null", **_IO, restore_from=tmpdir) # restore from file - FrozenModel(**_IO, restore_from=path.join(tmpdir, 'model.pb')) + FrozenModel(log_dir="/dev/null", **_IO, restore_from=path.join(tmpdir, 'model.pb')) # wrong configurations dummy_model.save('another') with pytest.raises(ValueError): - FrozenModel(**_IO, restore_from=tmpdir) # multiple .pb files + FrozenModel(log_dir="/dev/null", **_IO, restore_from=tmpdir) # multiple .pb files with pytest.raises(ValueError): - FrozenModel(**_IO, restore_from='/something/that/does/not/exist') + FrozenModel(log_dir="/dev/null", **_IO, restore_from='/something/that/does/not/exist') def test_frozen_model_misc(tmpdir): @@ -44,7 +44,7 @@ def test_frozen_model_misc(tmpdir): dummy_model.save('') # restore from directory - frozen_model = FrozenModel(**_IO, restore_from=tmpdir, session_config={'allow_soft_placement': True}) + frozen_model = FrozenModel(log_dir="/dev/null", **_IO, restore_from=tmpdir, session_config={'allow_soft_placement': True}) assert 
frozen_model.restore_fallback == 'emloop_tensorflow.FrozenModel' assert frozen_model.input_names == _IO['inputs'] @@ -63,7 +63,7 @@ def test_frozen_model_run(tmpdir): mainloop.run_training(None) model.save('') - frozen_model = FrozenModel(inputs=['input'], outputs=['output'], restore_from=tmpdir) + frozen_model = FrozenModel(log_dir="/dev/null", inputs=['input'], outputs=['output'], restore_from=tmpdir) with pytest.raises(AssertionError): frozen_model.run({}, True, None) From d2e5ea38e378d1df83c7ed27d3f792ffd56921fb Mon Sep 17 00:00:00 2001 From: beda Date: Tue, 29 Jan 2019 10:01:22 +0100 Subject: [PATCH 03/10] Refactor profiling --- emloop_tensorflow/frozen_model.py | 30 +++++-------- emloop_tensorflow/model.py | 47 +++++++++++--------- emloop_tensorflow/tests/frozen_model_test.py | 15 ++++--- emloop_tensorflow/utils/__init__.py | 3 +- emloop_tensorflow/utils/profiler.py | 40 +++++++++++++++++ 5 files changed, 89 insertions(+), 46 deletions(-) create mode 100644 emloop_tensorflow/utils/profiler.py diff --git a/emloop_tensorflow/frozen_model.py b/emloop_tensorflow/frozen_model.py index 7038127..37bd1df 100644 --- a/emloop_tensorflow/frozen_model.py +++ b/emloop_tensorflow/frozen_model.py @@ -5,10 +5,10 @@ import emloop as el import tensorflow as tf -from tensorflow.python.client import timeline from .graph_tower import GraphTower from .model import BaseModel +from .utils import Profiler class FrozenModel(el.AbstractModel): @@ -29,19 +29,19 @@ class FrozenModel(el.AbstractModel): """ - def __init__(self, - log_dir: str, inputs: List[str], outputs: List[str], restore_from: str, - session_config: Optional[dict]=None, n_gpus: int=0, profile: bool=False, **_): + def __init__(self, inputs: List[str], outputs: List[str], restore_from: str, log_dir: Optional[str]=None, + session_config: Optional[dict]=None, n_gpus: int=0, profile: bool=False, keep_profiles: int=5, **_): """ Initialize new :py:class:`FrozenModel` instance. 
- :param log_dir: path to the logging directory (wherein models should be saved) + :param log_dir: output directory :param inputs: model input names :param outputs: model output names :param restore_from: restore model path (either a dir or a .pb file) :param session_config: TF session configuration dict :param n_gpus: number of GPUs to use (either 0 or 1) - :param profile: whether profile.json should be saved to log_dir + :param profile: if true, profile the speed of model inference and save profiles to the specified log_dir + :param keep_profiles: how many profiles are saved """ super().__init__(None, '', restore_from) assert 0 <= n_gpus <= 1, 'FrozenModel can be used only with n_gpus=0 or n_gpus=1' @@ -64,9 +64,12 @@ def __init__(self, except KeyError: self._is_training = tf.placeholder(tf.bool, [], BaseModel.TRAINING_FLAG_NAME) - self._profile = profile - self._log_dir = log_dir + if profile and not log_dir: + raise ValueError('log_dir has to be specified with profile set to True') + self._profile = profile + if profile: + self._profiler = Profiler(log_dir, keep_profiles) def run(self, batch: el.Batch, train: bool=False, stream: el.datasets.StreamWrapper=None) -> Mapping[str, object]: """ @@ -92,16 +95,7 @@ def run(self, batch: el.Batch, train: bool=False, stream: el.datasets.StreamWrap fetches.append(self._tower[output_name]) if self._profile: - run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) - run_metadata = tf.RunMetadata() - - outputs = self._session.run(fetches=fetches, feed_dict=feed_dict, - options=run_options, run_metadata=run_metadata) - - with open(path.join(self._log_dir, "profile.json"), "w") as ofile: - tl = timeline.Timeline(run_metadata.step_stats) - ofile.write(tl.generate_chrome_trace_format()) - + outputs = self._profiler.run(self._session, fetches, feed_dict) else: outputs = self._session.run(fetches=fetches, feed_dict=feed_dict) diff --git a/emloop_tensorflow/model.py b/emloop_tensorflow/model.py index b896d17..1874ad2 
100644 --- a/emloop_tensorflow/model.py +++ b/emloop_tensorflow/model.py @@ -2,17 +2,16 @@ import logging from os import path from abc import ABCMeta -from typing import List, Mapping, Optional +from typing import List, Mapping, Optional, Dict from glob import glob import numpy as np import emloop as el import tensorflow as tf -from tensorflow.python.client import timeline from .third_party.tensorflow.freeze_graph import freeze_graph from .third_party.tensorflow.average_gradients import average_gradients -from .utils import create_optimizer +from .utils import create_optimizer, Profiler from .graph_tower import GraphTower DEFAULT_LOSS_NAME = 'loss' @@ -46,7 +45,7 @@ def __init__(self, # pylint: disable=too-many-arguments session_config: Optional[dict]=None, n_gpus: int=0, restore_from: Optional[str]=None, optimizer=None, freeze=False, loss_name: str=DEFAULT_LOSS_NAME, monitor: Optional[str]=None, restore_fallback: Optional[str]=None, clip_gradient: Optional[float]=None, profile: bool=False, - **kwargs): + keep_profiles: int=5, **kwargs): """ Create new emloop trainable TensorFlow model. @@ -84,6 +83,7 @@ def __init__(self, # pylint: disable=too-many-arguments :param restore_fallback: ignored arg. 
(allows training from configs saved by emloop where it is added) :param clip_gradient: limit the absolute value of the gradient; set to None for no clipping :param profile: whether profile.json should be saved to log_dir + :param keep_profiles: how many profiles are saved :param kwargs: additional kwargs forwarded to :py:meth:`_create_model` """ super().__init__(dataset=dataset, log_dir=log_dir, restore_from=restore_from) @@ -93,7 +93,6 @@ def __init__(self, # pylint: disable=too-many-arguments self._log_dir = log_dir self._freeze_graph = freeze self._clip_gradient = clip_gradient - self._profile = profile self._loss_name = loss_name self._train_ops = [] self._graph = self._saver = None @@ -101,6 +100,13 @@ def __init__(self, # pylint: disable=too-many-arguments if n_gpus == 0: self._towers.append(GraphTower(-1, inputs, outputs, loss_name)) + if profile and not log_dir: + raise ValueError('log_dir has to be specified with profile set to True') + + self._profile = profile + if profile: + self._profiler = Profiler(log_dir, keep_profiles) + logging.info('\tCreating TF model on %s GPU devices', n_gpus) self._graph = tf.Graph() self._session = self._create_session(session_config) @@ -197,6 +203,20 @@ def session(self) -> tf.Session: """TF session object.""" return self._session + + def evaluate_graph(self, fetches: Dict, feed_dict: Dict): + """ + Evaluates tensorflow graph. + + :param fetches: names of output tensors + :param feed_dict: input tensors + """ + if self._profile: + return self._profiler.run(self._session, fetches, feed_dict) + + return self._session.run(fetches, feed_dict) + + def run(self, batch: el.Batch, train: bool=False, stream: el.datasets.StreamWrapper=None) -> Mapping[str, object]: """ Run the model with the given ``batch``. Update the trainable variables only if ``train`` is true.
@@ -226,25 +246,12 @@ def run(self, batch: el.Batch, train: bool=False, stream: el.datasets.StreamWrap for output_name in self.output_names: fetches.append(tower[output_name]) - run_options = None - run_metadata = None - if self._profile: - run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) - run_metadata = tf.RunMetadata() - # run the computational graph for one batch and allow buffering in the meanwhile if stream is not None: with stream.allow_buffering: - outputs = self._session.run(fetches=fetches, feed_dict=feed_dict, - options=run_options, run_metadata=run_metadata) + outputs = self.evaluate_graph(fetches, feed_dict) else: - outputs = self._session.run(fetches=fetches, feed_dict=feed_dict, - options=run_options, run_metadata=run_metadata) - - if self._profile: - with open(path.join(self._log_dir, "profile.json"), "w") as ofile: - tl = timeline.Timeline(run_metadata.step_stats) - ofile.write(tl.generate_chrome_trace_format()) + outputs = self.evaluate_graph(fetches, feed_dict) if train: outputs = outputs[1:] diff --git a/emloop_tensorflow/tests/frozen_model_test.py b/emloop_tensorflow/tests/frozen_model_test.py index 2631a0d..0805b2e 100644 --- a/emloop_tensorflow/tests/frozen_model_test.py +++ b/emloop_tensorflow/tests/frozen_model_test.py @@ -18,24 +18,24 @@ def test_frozen_model_restore(tmpdir): """Test frozen model restoration.""" with pytest.raises(ValueError): - FrozenModel(log_dir="/dev/null", inputs=[], outputs=[], restore_from=tmpdir) # there is no .pb file yet + FrozenModel(, inputs=[], outputs=[], restore_from=tmpdir) # there is no .pb file yet dummy_model = TrainableModel(dataset=None, log_dir=tmpdir, **_IO, freeze=True, optimizer=_OPTIMIZER) dummy_model.save('') # restore from directory - FrozenModel(log_dir="/dev/null", **_IO, restore_from=tmpdir) + FrozenModel(, **_IO, restore_from=tmpdir) # restore from file - FrozenModel(log_dir="/dev/null", **_IO, restore_from=path.join(tmpdir, 'model.pb')) + FrozenModel(, **_IO, 
restore_from=path.join(tmpdir, 'model.pb')) # wrong configurations dummy_model.save('another') with pytest.raises(ValueError): - FrozenModel(log_dir="/dev/null", **_IO, restore_from=tmpdir) # multiple .pb files + FrozenModel(, **_IO, restore_from=tmpdir) # multiple .pb files with pytest.raises(ValueError): - FrozenModel(log_dir="/dev/null", **_IO, restore_from='/something/that/does/not/exist') + FrozenModel(, **_IO, restore_from='/something/that/does/not/exist') def test_frozen_model_misc(tmpdir): @@ -44,7 +44,8 @@ def test_frozen_model_misc(tmpdir): dummy_model.save('') # restore from directory - frozen_model = FrozenModel(log_dir="/dev/null", **_IO, restore_from=tmpdir, session_config={'allow_soft_placement': True}) + frozen_model = FrozenModel(, **_IO, restore_from=tmpdir, + session_config={'allow_soft_placement': True}) assert frozen_model.restore_fallback == 'emloop_tensorflow.FrozenModel' assert frozen_model.input_names == _IO['inputs'] @@ -63,7 +64,7 @@ def test_frozen_model_run(tmpdir): mainloop.run_training(None) model.save('') - frozen_model = FrozenModel(log_dir="/dev/null", inputs=['input'], outputs=['output'], restore_from=tmpdir) + frozen_model = FrozenModel(, inputs=['input'], outputs=['output'], restore_from=tmpdir) with pytest.raises(AssertionError): frozen_model.run({}, True, None) diff --git a/emloop_tensorflow/utils/__init__.py b/emloop_tensorflow/utils/__init__.py index f74680e..c3e7e92 100644 --- a/emloop_tensorflow/utils/__init__.py +++ b/emloop_tensorflow/utils/__init__.py @@ -2,5 +2,6 @@ Module with TensorFlow util functions. 
""" from .reflection import create_activation, create_optimizer +from .profiler import Profiler -__all__ = ['create_activation', 'create_optimizer'] +__all__ = ['create_activation', 'create_optimizer', 'Profiler'] diff --git a/emloop_tensorflow/utils/profiler.py b/emloop_tensorflow/utils/profiler.py new file mode 100644 index 0000000..477c0e1 --- /dev/null +++ b/emloop_tensorflow/utils/profiler.py @@ -0,0 +1,40 @@ +import tensorflow as tf +from tensorflow.python.client import timeline +from typing import Dict +import os + + +class Profiler: + """ + Profiles neural networks and saves the profiles. + """ + + def __init__(self, log_dir: str, keep_profiles: int): + """ + :param log_dir: directory where profiles will be saved + :param keep_profiles: how many profiles are saved + """ + self._log_dir = log_dir + self._profile_counter = 0 + self._keep_profiles = keep_profiles + self._run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) + + def run(self, session: tf.Session, fetches: Dict, feed_dict: Dict): + """ + Evaluates the tensorflow graph with profiling, saves profile and returns outputs. 
+ + :param session: tensorflow session + :param fetches: names of output tensors + :param feed_dict: input tensors + """ + run_metadata = tf.RunMetadata() + outputs = session.run(fetches=fetches, feed_dict=feed_dict, + options=self._run_options, run_metadata=run_metadata) + + with open(os.path.join(self._log_dir, f'profile_{self._profile_counter}.json'), 'w') as ofile: + tl = timeline.Timeline(run_metadata.step_stats) + ofile.write(tl.generate_chrome_trace_format()) + + self._profile_counter = (self._profile_counter + 1) % self._keep_profiles + + return outputs From 9f7bb5552c6be8fc6b7933596feb874dcabfc832 Mon Sep 17 00:00:00 2001 From: beda Date: Tue, 29 Jan 2019 10:04:47 +0100 Subject: [PATCH 04/10] Cosmetic changes --- emloop_tensorflow/frozen_model.py | 1 - emloop_tensorflow/model.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/emloop_tensorflow/frozen_model.py b/emloop_tensorflow/frozen_model.py index 37bd1df..29b4d4c 100644 --- a/emloop_tensorflow/frozen_model.py +++ b/emloop_tensorflow/frozen_model.py @@ -53,7 +53,6 @@ def __init__(self, inputs: List[str], outputs: List[str], restore_from: str, log self._graph = tf.Graph() if session_config: session_config = tf.ConfigProto(**session_config) - self._session = tf.Session(graph=self._graph, config=session_config) with self._graph.as_default(): diff --git a/emloop_tensorflow/model.py b/emloop_tensorflow/model.py index 1874ad2..7bab04a 100644 --- a/emloop_tensorflow/model.py +++ b/emloop_tensorflow/model.py @@ -82,7 +82,7 @@ def __init__(self, # pylint: disable=too-many-arguments :param monitor: monitor signal mean and variance of the tensors which names contain the specified value :param restore_fallback: ignored arg. 
(allows training from configs saved by emloop where it is added) :param clip_gradient: limit the absolute value of the gradient; set to None for no clipping - :param profile: whether profile.json should be saved to log_dir + :param profile: if true, profile the speed of model inference and save profiles to the specified log_dir :param keep_profiles: how many profiles are saved :param kwargs: additional kwargs forwarded to :py:meth:`_create_model` """ From 44eaf7b76ae1f1a3d3e2bd437962a04e0de5e936 Mon Sep 17 00:00:00 2001 From: beda Date: Tue, 29 Jan 2019 10:08:14 +0100 Subject: [PATCH 05/10] Tests --- emloop_tensorflow/tests/frozen_model_test.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/emloop_tensorflow/tests/frozen_model_test.py b/emloop_tensorflow/tests/frozen_model_test.py index 0805b2e..9a338fe 100644 --- a/emloop_tensorflow/tests/frozen_model_test.py +++ b/emloop_tensorflow/tests/frozen_model_test.py @@ -18,24 +18,24 @@ def test_frozen_model_restore(tmpdir): """Test frozen model restoration.""" with pytest.raises(ValueError): - FrozenModel(, inputs=[], outputs=[], restore_from=tmpdir) # there is no .pb file yet + FrozenModel(inputs=[], outputs=[], restore_from=tmpdir) # there is no .pb file yet dummy_model = TrainableModel(dataset=None, log_dir=tmpdir, **_IO, freeze=True, optimizer=_OPTIMIZER) dummy_model.save('') # restore from directory - FrozenModel(, **_IO, restore_from=tmpdir) + FrozenModel(**_IO, restore_from=tmpdir) # restore from file - FrozenModel(, **_IO, restore_from=path.join(tmpdir, 'model.pb')) + FrozenModel(**_IO, restore_from=path.join(tmpdir, 'model.pb')) # wrong configurations dummy_model.save('another') with pytest.raises(ValueError): - FrozenModel(, **_IO, restore_from=tmpdir) # multiple .pb files + FrozenModel(**_IO, restore_from=tmpdir) # multiple .pb files with pytest.raises(ValueError): - FrozenModel(, **_IO,
restore_from='/something/that/does/not/exist') + FrozenModel(**_IO, restore_from='/something/that/does/not/exist') def test_frozen_model_misc(tmpdir): @@ -44,7 +44,7 @@ def test_frozen_model_misc(tmpdir): dummy_model.save('') # restore from directory - frozen_model = FrozenModel(, **_IO, restore_from=tmpdir, + frozen_model = FrozenModel(**_IO, restore_from=tmpdir, session_config={'allow_soft_placement': True}) assert frozen_model.restore_fallback == 'emloop_tensorflow.FrozenModel' @@ -64,7 +64,7 @@ def test_frozen_model_run(tmpdir): mainloop.run_training(None) model.save('') - frozen_model = FrozenModel(, inputs=['input'], outputs=['output'], restore_from=tmpdir) + frozen_model = FrozenModel(inputs=['input'], outputs=['output'], restore_from=tmpdir) with pytest.raises(AssertionError): frozen_model.run({}, True, None) From 970da2b800b4f8b20d6adca3a3898d8fec83b123 Mon Sep 17 00:00:00 2001 From: beda Date: Tue, 29 Jan 2019 10:10:09 +0100 Subject: [PATCH 06/10] Cosmetic changes --- emloop_tensorflow/tests/frozen_model_test.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/emloop_tensorflow/tests/frozen_model_test.py b/emloop_tensorflow/tests/frozen_model_test.py index 9a338fe..190938a 100644 --- a/emloop_tensorflow/tests/frozen_model_test.py +++ b/emloop_tensorflow/tests/frozen_model_test.py @@ -44,8 +44,7 @@ def test_frozen_model_misc(tmpdir): dummy_model.save('') # restore from directory - frozen_model = FrozenModel(**_IO, restore_from=tmpdir, - session_config={'allow_soft_placement': True}) + frozen_model = FrozenModel(**_IO, restore_from=tmpdir, session_config={'allow_soft_placement': True}) assert frozen_model.restore_fallback == 'emloop_tensorflow.FrozenModel' assert frozen_model.input_names == _IO['inputs'] From e2ff8c87ebca7504a23fc19261b82f23ffbaef1d Mon Sep 17 00:00:00 2001 From: beda Date: Tue, 29 Jan 2019 10:12:51 +0100 Subject: [PATCH 07/10] Cosmetic changes --- emloop_tensorflow/utils/profiler.py | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/emloop_tensorflow/utils/profiler.py b/emloop_tensorflow/utils/profiler.py index 477c0e1..f39abd2 100644 --- a/emloop_tensorflow/utils/profiler.py +++ b/emloop_tensorflow/utils/profiler.py @@ -6,7 +6,7 @@ class Profiler: """ - Profiles neural networks and saves the profiles. + Profiles tensorflow graphs and saves the profiles. """ def __init__(self, log_dir: str, keep_profiles: int): @@ -29,7 +29,7 @@ def run(self, session: tf.Session, fetches: Dict, feed_dict: Dict): """ run_metadata = tf.RunMetadata() outputs = session.run(fetches=fetches, feed_dict=feed_dict, - options=self._run_options, run_metadata=run_metadata) + options=self._run_options, run_metadata=run_metadata) with open(os.path.join(self._log_dir, f'profile_{self._profile_counter}.json'), 'w') as ofile: tl = timeline.Timeline(run_metadata.step_stats) From 6eaa70dda69a12b2fcb503d1e6ce924e79d52763 Mon Sep 17 00:00:00 2001 From: beda Date: Tue, 29 Jan 2019 10:49:48 +0100 Subject: [PATCH 08/10] Refactoring profiler --- emloop_tensorflow/frozen_model.py | 4 ++-- emloop_tensorflow/model.py | 28 ++++++++-------------------- emloop_tensorflow/utils/profiler.py | 9 +++++---- 3 files changed, 15 insertions(+), 26 deletions(-) diff --git a/emloop_tensorflow/frozen_model.py b/emloop_tensorflow/frozen_model.py index 29b4d4c..9bbf773 100644 --- a/emloop_tensorflow/frozen_model.py +++ b/emloop_tensorflow/frozen_model.py @@ -68,7 +68,7 @@ def __init__(self, inputs: List[str], outputs: List[str], restore_from: str, log self._profile = profile if profile: - self._profiler = Profiler(log_dir, keep_profiles) + self._profiler = Profiler(log_dir, keep_profiles, self._session) def run(self, batch: el.Batch, train: bool=False, stream: el.datasets.StreamWrapper=None) -> Mapping[str, object]: """ @@ -94,7 +94,7 @@ def run(self, batch: el.Batch, train: bool=False, stream: el.datasets.StreamWrap fetches.append(self._tower[output_name]) if self._profile: - outputs = 
self._profiler.run(self._session, fetches, feed_dict) + outputs = self._profiler.run(fetches=fetches, feed_dict=feed_dict) else: outputs = self._session.run(fetches=fetches, feed_dict=feed_dict) diff --git a/emloop_tensorflow/model.py b/emloop_tensorflow/model.py index 7bab04a..b8a934b 100644 --- a/emloop_tensorflow/model.py +++ b/emloop_tensorflow/model.py @@ -99,17 +99,17 @@ def __init__(self, # pylint: disable=too-many-arguments self._towers = [GraphTower(i, inputs, outputs, loss_name) for i in range(n_gpus)] if n_gpus == 0: self._towers.append(GraphTower(-1, inputs, outputs, loss_name)) + logging.info('\tCreating TF model on %s GPU devices', n_gpus) + self._graph = tf.Graph() + self._session = self._create_session(session_config) if profile and not log_dir: raise ValueError('log_dir has to be specified with profile set to True') self._profile = profile if profile: - self._profiler = Profiler(log_dir, keep_profiles) + self._profiler = Profiler(log_dir, keep_profiles, self._session) - logging.info('\tCreating TF model on %s GPU devices', n_gpus) - self._graph = tf.Graph() - self._session = self._create_session(session_config) dependencies = [] with self._graph.as_default(): if restore_from is None: @@ -203,20 +203,6 @@ def session(self) -> tf.Session: """TF session object.""" return self._session - - def evaluate_graph(self, fetches: Dict, feed_dict: Dict): - """ - Evaluates tensorflow graph. - - :param fetches: names of output tensors - :param feed_dict: input tensors - """ - if self._profile: - return self._profiler.run(self._session, fetches, feed_dict) - - return self._session.run(fetches, feed_dict) - - def run(self, batch: el.Batch, train: bool=False, stream: el.datasets.StreamWrapper=None) -> Mapping[str, object]: """ Run the model with the given ``batch``. Update the trainable variables only if ``train`` is true. 
@@ -246,12 +232,14 @@ def run(self, batch: el.Batch, train: bool=False, stream: el.datasets.StreamWrap for output_name in self.output_names: fetches.append(tower[output_name]) + run_fn = self._profiler.run if self._profile else self._session.run + # run the computational graph for one batch and allow buffering in the meanwhile if stream is not None: with stream.allow_buffering: - outputs = self.evaluate_graph(fetches, feed_dict) + outputs = run_fn(fetches, feed_dict) else: - outputs = self.evaluate_graph(fetches, feed_dict) + outputs = run_fn(fetches, feed_dict) if train: outputs = outputs[1:] diff --git a/emloop_tensorflow/utils/profiler.py b/emloop_tensorflow/utils/profiler.py index f39abd2..cc2f4ef 100644 --- a/emloop_tensorflow/utils/profiler.py +++ b/emloop_tensorflow/utils/profiler.py @@ -9,7 +9,7 @@ class Profiler: Profiles tensorflow graphs and saves the profiles. """ - def __init__(self, log_dir: str, keep_profiles: int): + def __init__(self, log_dir: str, keep_profiles: int, session: tf.Session): """ :param log_dir: directory where profiles will be saved :param keep_profiles: how many profiles are saved @@ -18,8 +18,9 @@ def __init__(self, log_dir: str, keep_profiles: int): self._profile_counter = 0 self._keep_profiles = keep_profiles self._run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) + self._session = session - def run(self, session: tf.Session, fetches: Dict, feed_dict: Dict): + def run(self, fetches: Dict, feed_dict: Dict): """ Evaluates the tensorflow graph with profiling, saves profile and returns outputs. 
@@ -28,8 +29,8 @@ def run(self, session: tf.Session, fetches: Dict, feed_dict: Dict): :param feed_dict: input tensors """ run_metadata = tf.RunMetadata() - outputs = session.run(fetches=fetches, feed_dict=feed_dict, - options=self._run_options, run_metadata=run_metadata) + outputs = self._session.run(fetches=fetches, feed_dict=feed_dict, + options=self._run_options, run_metadata=run_metadata) with open(os.path.join(self._log_dir, f'profile_{self._profile_counter}.json'), 'w') as ofile: tl = timeline.Timeline(run_metadata.step_stats) From 3bf6f10ade28e8b5664aee18548509024f2cef72 Mon Sep 17 00:00:00 2001 From: beda Date: Tue, 29 Jan 2019 11:32:02 +0100 Subject: [PATCH 09/10] Documentation --- docs/advanced.rst | 27 +++++++++++++++++++++++++++ docs/conf.py | 3 ++- 2 files changed, 29 insertions(+), 1 deletion(-) create mode 100644 docs/advanced.rst diff --git a/docs/advanced.rst b/docs/advanced.rst new file mode 100644 index 0000000..3a0c67f --- /dev/null +++ b/docs/advanced.rst @@ -0,0 +1,27 @@ +Profiling networks +------------------ +Profiling the execution of a tensorflow graph can be enabled with the following setting: + +.. code-block:: yaml + :caption: config.yaml + + model: + profile: True + keep_profiles: 10 + +This saves profiles of the last 10 runs to the log directory (output directory). +Profiles are in JSON format and can be viewed using Google Chrome. +To view them go to address `chrome://tracing/` and load the json file. + +Gradient clipping +----------------- +For gradient clipping use the following setting: + +.. code-block:: yaml + :caption: config.yaml + + model: + clip_gradient: 5.0 + +This clips the absolute value of the gradient to 5.0. +Note that the clipping is done to raw gradients before they are multiplied by learning rate or processed in other ways. diff --git a/docs/conf.py b/docs/conf.py index da8e229..b0b160d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -13,7 +13,7 @@ # General information about the project.
project = 'emloop-tensorflow' copyright = '2018, Iterait a.s.' -author = 'Blazek Adam, Belohlavek Petr, Matzner Filip' +author = 'Blazek Adam, Belohlavek Petr, Matzner Filip, Bedrich Pisl' # The short X.Y version. version = '.'.join(pkg_resources.get_distribution("emloop-tensorflow").version.split('.')[:2]) @@ -37,6 +37,7 @@ ("Tutorial", "tutorial"), ("Model Regularization", "regularization"), ("Multi GPU models", "multi_gpu"), + ("Advanced", "advanced"), ("API Reference", "emloop_tensorflow/index"), ], }) From 45a674f8799427030f9712821ce97fe1132f1af4 Mon Sep 17 00:00:00 2001 From: beda Date: Fri, 8 Feb 2019 11:57:38 +0100 Subject: [PATCH 10/10] Add profiling test --- emloop_tensorflow/tests/model_test.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/emloop_tensorflow/tests/model_test.py b/emloop_tensorflow/tests/model_test.py index 66ad80a..35f4c97 100644 --- a/emloop_tensorflow/tests/model_test.py +++ b/emloop_tensorflow/tests/model_test.py @@ -419,6 +419,21 @@ def test_regularization(): regularized_model2.run(good_batch, train=True) +def test_profiling(tmpdir): + """Test whether profile is created.""" + model = TrainableModel(dataset=None, log_dir=tmpdir, **_IO, optimizer=_OPTIMIZER, profile=True, keep_profiles=10) + batch = {'input': [[1]*10], 'target': [[0]*10]} + + # run enough steps that the profile counter wraps around keep_profiles several times + for _ in range(1000): + model.run(batch, train=True) + + for i in range(10): + assert path.exists(f"{tmpdir}/profile_{i}.json") + + assert not path.exists(f"{tmpdir}/profile_11.json") + + ####################### # TF Base Model Saver # #######################