Profiling of neural networks #26
@@ -0,0 +1,27 @@
Profiling networks
------------------

Profiling the execution of the TensorFlow graph can be enabled with the following setting:

.. code-block:: yaml
    :caption: config.yaml

    model:
      profile: True
      keep_profiles: 10

This saves the profiles of the last 10 runs to the log directory (output directory).
The profiles are in JSON format and can be viewed with Google Chrome.
To view them, go to the address `chrome://tracing/` and load the JSON file.
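Besides opening the trace in Chrome, a saved profile can also be inspected programmatically. The following is only a sketch; the file path is illustrative, and it relies on the Chrome trace format (a ``traceEvents`` list with durations in microseconds) written by the TensorFlow timeline:

.. code-block:: python

    import json

    # load one of the saved profiles (path is illustrative)
    with open('logs/profile_0.json') as f:
        trace = json.load(f)

    # keep only events that carry a duration ('dur', in microseconds)
    events = [e for e in trace['traceEvents'] if 'dur' in e]
    print(f'{len(events)} timed events')

    # print the five longest-running ops
    for event in sorted(events, key=lambda e: -e['dur'])[:5]:
        print(event.get('name'), event['dur'], 'us')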
Gradient clipping
-----------------

For gradient clipping, use the following setting:

.. code-block:: yaml
    :caption: config.yaml

    model:
      clip_gradient: 5.0

This clips the absolute value of each gradient to 5.0.
Note that the clipping is applied to the raw gradients, before they are multiplied by the learning rate or processed in any other way.
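For reference, clipping raw gradients by absolute value with a TF 1.x optimizer typically looks like the sketch below. It is illustrative only: it assumes a `loss` tensor is already defined and is not necessarily the exact implementation used in this repository.

.. code-block:: python

    import tensorflow as tf

    optimizer = tf.train.AdamOptimizer(learning_rate=0.001)

    # compute raw gradients, clip each element to [-5.0, 5.0], then apply the update
    grads_and_vars = optimizer.compute_gradients(loss)
    clipped = [(tf.clip_by_value(grad, -5.0, 5.0), var)
               for grad, var in grads_and_vars if grad is not None]
    train_op = optimizer.apply_gradients(clipped)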
@@ -13,7 +13,7 @@
# General information about the project.
project = 'emloop-tensorflow'
copyright = '2018, Iterait a.s.'
-author = 'Blazek Adam, Belohlavek Petr, Matzner Filip'
+author = 'Blazek Adam, Belohlavek Petr, Matzner Filip, Bedrich Pisl'
nice :)

Nice except that Beda's name and surname are in the wrong order. :-)

whatever
# The short X.Y version.
version = '.'.join(pkg_resources.get_distribution("emloop-tensorflow").version.split('.')[:2])
@@ -37,6 +37,7 @@
    ("Tutorial", "tutorial"),
    ("Model Regularization", "regularization"),
    ("Multi GPU models", "multi_gpu"),
+   ("Advanced", "advanced"),
    ("API Reference", "emloop_tensorflow/index"),
  ],
})
@@ -2,7 +2,7 @@
import logging
from os import path
from abc import ABCMeta
-from typing import List, Mapping, Optional
+from typing import List, Mapping, Optional, Dict
from glob import glob

import numpy as np
@@ -11,7 +11,7 @@

from .third_party.tensorflow.freeze_graph import freeze_graph
from .third_party.tensorflow.average_gradients import average_gradients
-from .utils import create_optimizer
+from .utils import create_optimizer, Profiler
from .graph_tower import GraphTower

DEFAULT_LOSS_NAME = 'loss'
@@ -44,8 +44,8 @@ def __init__(self,  # pylint: disable=too-many-arguments
dataset: Optional[el.AbstractDataset], log_dir: Optional[str], inputs: List[str], outputs: List[str],
I would make the

Is this in some way related to this pull request?
session_config: Optional[dict]=None, n_gpus: int=0, restore_from: Optional[str]=None,
optimizer=None, freeze=False, loss_name: str=DEFAULT_LOSS_NAME, monitor: Optional[str]=None,
-restore_fallback: Optional[str]=None, clip_gradient: Optional[float]=None,
-**kwargs):
+restore_fallback: Optional[str]=None, clip_gradient: Optional[float]=None, profile: bool=False,
+keep_profiles: int=5, **kwargs):
"""
Create new emloop trainable TensorFlow model.

@@ -82,6 +82,8 @@ def __init__(self,  # pylint: disable=too-many-arguments
:param monitor: monitor signal mean and variance of the tensors which names contain the specified value
:param restore_fallback: ignored arg. (allows training from configs saved by emloop where it is added)
:param clip_gradient: limit the absolute value of the gradient; set to None for no clipping
+:param profile: if True, profile the speed of model inference and save the profiles to the specified log_dir
+:param keep_profiles: number of profiles to keep in the log_dir; older profiles are overwritten
:param kwargs: additional kwargs forwarded to :py:meth:`_create_model`
"""
super().__init__(dataset=dataset, log_dir=log_dir, restore_from=restore_from)
@@ -97,10 +99,17 @@ def __init__(self,  # pylint: disable=too-many-arguments
self._towers = [GraphTower(i, inputs, outputs, loss_name) for i in range(n_gpus)]
if n_gpus == 0:
    self._towers.append(GraphTower(-1, inputs, outputs, loss_name))

logging.info('\tCreating TF model on %s GPU devices', n_gpus)
self._graph = tf.Graph()
self._session = self._create_session(session_config)

+if profile and not log_dir:
+    raise ValueError('log_dir has to be specified with profile set to True')
+
+self._profile = profile
+if profile:
+    self._profiler = Profiler(log_dir, keep_profiles, self._session)
+
dependencies = []
with self._graph.as_default():
    if restore_from is None:
@@ -223,12 +232,14 @@ def run(self, batch: el.Batch, train: bool=False, stream: el.datasets.StreamWrap
for output_name in self.output_names:
    fetches.append(tower[output_name])

+run_fn = self._profiler.run if self._profile else self._session.run
+
# run the computational graph for one batch and allow buffering in the meanwhile
if stream is not None:
    with stream.allow_buffering:
-        outputs = self._session.run(fetches=fetches, feed_dict=feed_dict)
+        outputs = run_fn(fetches, feed_dict)
else:
-    outputs = self._session.run(fetches=fetches, feed_dict=feed_dict)
+    outputs = run_fn(fetches, feed_dict)

if train:
    outputs = outputs[1:]
@@ -0,0 +1,41 @@
import tensorflow as tf
from tensorflow.python.client import timeline
from typing import Dict
import os


class Profiler:
    """
    Profiles TensorFlow graph executions and saves the profiles.
    """

    def __init__(self, log_dir: str, keep_profiles: int, session: tf.Session):
        """
        :param log_dir: directory where the profiles will be saved
        :param keep_profiles: how many profiles are kept; older profiles are overwritten
        :param session: TensorFlow session used to run the profiled graph
        """
        self._log_dir = log_dir
        self._profile_counter = 0
        self._keep_profiles = keep_profiles
        self._run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        self._session = session

    def run(self, fetches: Dict, feed_dict: Dict):
        """
        Evaluate the TensorFlow graph with profiling enabled, save the profile and return the outputs.

        :param fetches: output tensors to be fetched
        :param feed_dict: input tensors
        """
        run_metadata = tf.RunMetadata()
        outputs = self._session.run(fetches=fetches, feed_dict=feed_dict,
                                    options=self._run_options, run_metadata=run_metadata)

        with open(os.path.join(self._log_dir, f'profile_{self._profile_counter}.json'), 'w') as ofile:
            tl = timeline.Timeline(run_metadata.step_stats)
            ofile.write(tl.generate_chrome_trace_format())

        self._profile_counter = (self._profile_counter + 1) % self._keep_profiles

        return outputs
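A brief usage sketch of the `Profiler` above, not a definitive example: the import path is inferred from the `from .utils import create_optimizer, Profiler` change earlier in this diff, and the graph, shapes, and paths are illustrative.

.. code-block:: python

    import os

    import numpy as np
    import tensorflow as tf

    from emloop_tensorflow.utils import Profiler  # import path inferred from the diff above

    # a tiny graph to profile (names and shapes are illustrative)
    x = tf.placeholder(tf.float32, shape=[None, 4], name='x')
    y = tf.layers.dense(x, 2, name='dense')

    log_dir = '/tmp/profiles'
    os.makedirs(log_dir, exist_ok=True)  # the profiler writes into an existing directory

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        profiler = Profiler(log_dir, keep_profiles=3, session=session)
        outputs = profiler.run(fetches=[y], feed_dict={x: np.random.rand(8, 4).astype(np.float32)})
        # /tmp/profiles/profile_0.json can now be loaded at chrome://tracing/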
Thank you for the docs.