From b7c500a2eb2a907e683a689723968fd9a3434dc1 Mon Sep 17 00:00:00 2001 From: Ryan Date: Tue, 15 Oct 2024 04:17:23 -0600 Subject: [PATCH] some keras cb docs and such Should I actually rip out all of the TFKerasTrial docs? Or just leave them marked as deprecated? --- .../apis-howto/api-core-ug-basic.rst | 4 +- .../api-guides/apis-howto/api-keras-ug.rst | 196 ++++++++++-------- docs/model-dev-guide/create-experiment.rst | 25 +++ docs/model-dev-guide/debug-models.rst | 10 +- .../dtrain/reproducibility.rst | 14 +- .../search-methods/hp-adaptive-asha.rst | 12 +- docs/model-dev-guide/profiling.rst | 6 +- .../reference/experiment-config-reference.rst | 13 +- .../reference/training/api-core-reference.rst | 7 - docs/reference/training/api-det-reference.rst | 7 + .../training/api-keras-reference.rst | 19 ++ docs/tools/tensorboard.rst | 19 +- .../iris_tf_keras/adaptive.yaml | 14 +- .../computer_vision/iris_tf_keras/const.yaml | 16 +- .../iris_tf_keras/distributed.yaml | 14 +- .../iris_tf_keras/model_def.py | 105 ---------- .../computer_vision/iris_tf_keras/train.py | 138 ++++++++++++ 17 files changed, 354 insertions(+), 265 deletions(-) delete mode 100644 examples/computer_vision/iris_tf_keras/model_def.py create mode 100644 examples/computer_vision/iris_tf_keras/train.py diff --git a/docs/model-dev-guide/api-guides/apis-howto/api-core-ug-basic.rst b/docs/model-dev-guide/api-guides/apis-howto/api-core-ug-basic.rst index 62b0d87c7c80..5e27ce295424 100644 --- a/docs/model-dev-guide/api-guides/apis-howto/api-core-ug-basic.rst +++ b/docs/model-dev-guide/api-guides/apis-howto/api-core-ug-basic.rst @@ -25,8 +25,8 @@ the the following capabilities: - hyperparameter search - distributing work across multiple GPUs and/or nodes -These are the same features provided by the higher-level PyTorchTrial, DeepSpeedTrial, and -TFKerasTrial APIs: those APIs are implemented using the Core API. 
+These are the same features provided by the higher-level PyTorchTrial and DeepSpeedTrial APIs: +those APIs are implemented using the Core API. This user guide shows you how to get started using the Core API. diff --git a/docs/model-dev-guide/api-guides/apis-howto/api-keras-ug.rst b/docs/model-dev-guide/api-guides/apis-howto/api-keras-ug.rst index f871c9172ece..dc7601804740 100644 --- a/docs/model-dev-guide/api-guides/apis-howto/api-keras-ug.rst +++ b/docs/model-dev-guide/api-guides/apis-howto/api-keras-ug.rst @@ -7,7 +7,8 @@ .. meta:: :description: Learn how to use the Keras API to train a Keras model. This user guide walks you through loading your data, defining the model, customizing how the model.fit function is called, checkpointing, and callbacks. -In this guide, you'll learn how to use the Keras API. +In this guide, you'll learn how to use Determined's keras.DeterminedCallback while training your +keras model. +---------------------------------------------------------------------+ | Visit the API reference | @@ -15,121 +16,146 @@ In this guide, you'll learn how to use the Keras API. | :ref:`keras-reference` | +---------------------------------------------------------------------+ -This document guides you through training a Keras model in Determined. You need to implement a trial -class that inherits :class:`~determined.keras.TFKerasTrial` and specify it as the entrypoint in the -:ref:`experiment-configuration`. +This document guides you through training a Keras model in Determined. You will need to update your +``model.fit()`` call to include a :class:`~determined.keras.DeterminedCallback` and submit it to +a Determined cluster. -To learn about this API, you can start by reading the trial definitions in the `Iris categorization -example +To learn about this API, you can start by reading the ``train.py`` script in the `Iris +categorization example `__. 
-*********** - Load Data -*********** +********************** + Configure Entrypoint +********************** -.. note:: +Determined requires you to launch training jobs by submitting them with an +:ref:`experiment-configuration` that tells the Determined master how to start your container. For +Keras training, you will always want to wrap your training script in Determined's :ref:`TensorFlow +launcher <launch-tensorflow>`: - Before loading data, visit :ref:`load-model-data` to understand how to work with different - sources of data. +.. code:: yaml -Loading data is done by defining :meth:`~determined.keras.TFKerasTrial.build_training_data_loader` -and :meth:`~determined.keras.TFKerasTrial.build_validation_data_loader` methods. Each should return -one of the following data types: + entrypoint: >- + python3 -m determined.launch.tensorflow -- + python3 my_train.py --my-arg... -#. A tuple ``(x, y)`` of NumPy arrays. x must be a NumPy array (or array-like), a list of arrays (in - case the model has multiple inputs), or a dict mapping input names to the corresponding array, if - the model has named inputs. y should be a numpy array. +Determined's TensorFlow launcher will automatically configure your training script with the right +``TF_CONFIG`` environment variable for distributed training when distributed resources are +available, and will safely do nothing when they are not. +**************************************************************** + Obtain a ``det.core.Context`` and a ``tf.distribute.Strategy`` +**************************************************************** -#. A tuple ``(x, y, sample_weights)`` of NumPy arrays. +When using distributed training, TensorFlow requires you to create your ``Strategy`` early in the +process lifetime, before creating your model. -#. A ``tf.data.dataset`` returning a tuple of either (inputs, targets) or (inputs, targets, - sample_weights). +Since you wrapped your training script in Determined's TensorFlow launcher, you can use Determined's +``core.DistributedContext.from_tf_config()`` helper, which will create both a suitable +``DistributedContext`` and ``Strategy`` for the training environment in your training job. Then you +can feed that ``DistributedContext`` to ``det.core.init()`` to get a ``core.Context``, and feed all +of that to your ``main()`` function (or equivalent) in your training script: -#. 
A ``keras.utils.Sequence`` returning a tuple of either (inputs, targets) or (inputs, targets, - sample weights). +Since you wrapped your training script in Determined's TensorFlow launcher, you can use Determined's +``core.DistributedContext.from_tf_config()`` helper, which will create both a suitable +``DistributedContext`` and ``Strategy`` for the training environment in your training job. Then you +can feed that ``DistributedContext`` to ``det.core.init()`` to get a ``core.Context``, and feed all +of that to your ``main()`` function (or equivalent) in your training script: -If using ``tf.data.Dataset``, users are required to wrap both their training and validation dataset -using :meth:`self.context.wrap_dataset `. This -wrapper is used to shard the dataset for distributed training. For optimal performance, users should -wrap a dataset immediately after creating it. +.. code:: python -.. include:: ../../../_shared/note-dtrain-learn-more.txt + if __name__ == "__main__": + distributed, strategy = det.core.DistributedContext.from_tf_config() + with det.core.init(distributed=distributed) as core_context: + main(core_context, strategy) -****************** - Define the Model -****************** +***************** + Build the Model +***************** -Users are required wrap their model prior to compiling it using :meth:`self.context.wrap_model -`. This is typically done inside -:meth:`~determined.keras.TFKerasTrial.build_model`. +Building a distributed-capable model is easy in keras; you just need to wrap your model building and +compiling in the ``strategy.scope()``. See the `TensorFlow documentation +`__ for more detail. -****************************************** - Customize Calling Model Fitting Function -****************************************** +.. code:: python -The :class:`~determined.keras.TFKerasTrial` interface allows the user to configure how ``model.fit`` -is called by calling :meth:`self.context.configure_fit() -`. 
+ def main(core_context, strategy): + with strategy.scope(): + model = my_build_model() + model.compile(...) -*************** - Checkpointing -*************** +*********************************** + Create the ``DeterminedCallback`` +*********************************** -A checkpoint includes the model definition (Python source code), experiment configuration file, -network architecture, and the values of the model's parameters (i.e., weights) and hyperparameters. -When using a stateful optimizer during training, checkpoints will also include the state of the -optimizer (i.e., learning rate). You can also embed arbitrary metadata in checkpoints via a -:ref:`Python SDK `. +The :class:`~determined.keras.DeterminedCallback` will automatically integrate your training with +the Determined cluster. It reports train and test metrics, it reports progress, it saves +checkpoints, and it uploads them to checkpoint storage. It also handles preemption signals from the +Determined master (such as if you pause your experiment), shutting down training, then it restores +training from where it left off when the experiment continues. -TensorFlow Keras trials are checkpointed to a file named ``determined-keras-model.h5`` using -``tf.keras.models.save_model``. You can learn more from the `TF Keras docs -`__. +The ``DeterminedCallback`` has only three required inputs: + - the ``core_context`` you already created + - a ``checkpoint`` UUID to start training from, or ``None``. + - a ``continue_id`` used to decide how to treat the checkpoint. -*********** - Callbacks -*********** +In training jobs, a value for ``checkpoint`` should be obtained from +``det.get_cluster_info().latest_checkpoint``, which will automatically be populated with the latest +checkpoint saved by this trial, if there is one. 
-To execute arbitrary Python code during the lifecycle of a :class:`~determined.keras.TFKerasTrial`, -implement the :class:`determined.keras.callbacks.Callback` interface (an extension of the -``tf.keras.callbacks.Callbacks`` interface) and supply them to the -:class:`~determined.keras.TFKerasTrial` by implementing -:meth:`~determined.keras.TFKerasTrial.keras_callbacks`. +The ``continue_id`` helps the ``DeterminedCallback`` decide if the provided checkpoint represents +just the starting weights and training should begin at epoch=0, or if the checkpoint represents a +partially complete training that should pick up where it left off (at epoch > 0). The provided +``continue_id`` is saved along with every checkpoint, and when loading the starting checkpoint, if +the ``continue_id`` matches what was in the checkpoint, training state is also loaded from the +checkpoint. In training jobs, an easy value for ``continue_id`` is +``det.get_cluster_info().trial.trial_id``. -.. _keras-profiler: +See the reference for :class:`~determined.keras.DeterminedCallback` to see the optional parameters. + +.. code:: python + + info = det.get_cluster_info() + assert info and info.task_type == "TRIAL", "this example only runs as a trial on the cluster" + + det_cb = det.keras.DeterminedCallback( + core_context, + checkpoint=info.latest_checkpoint, + continue_id=info.trial.trial_id, + ) *********** - Profiling + Load Data *********** -Determined supports integration with the native TF Keras profiler. Results will automatically be -uploaded to the trial's TensorBoard path and can be viewed in the Determined Web UI. +Loading data is done normally, though additional considerations may arise if your existing data +loading code is not already container-ready. See :ref:`load-model-data` for more information. -The Keras profiler is configured as a callback in the :class:`~determined.keras.TFKerasTrial` class. 
-The :class:`determined.keras.callbacks.TensorBoard` callback is a thin wrapper around the native -Keras TensorBoard callback, ``tf.keras.callbacks.TensorBoard``. It overrides the ``log_dir`` -argument to set the Determined TensorBoard path, while other arguments are passed directly into -``tf.keras.callbacks.TensorBoard``. For a list of accepted arguments, consult the `official Keras -API documentation `_. +If you would like to take advantage of Determined's distributed training, you may need to ensure that +your input data is properly sharded. See `TensorFlow documentation +`__ for details. + +.. include:: ../../../_shared/note-dtrain-learn-more.txt -The following code snippet will configure profiling for batches 5 and 10, and will compute weight -histograms every 1 epochs. +************************* + TensorBoard Integration +************************* + +Optionally, you can use Determined's :class:`~determined.keras.TensorBoard` callback, which extends +keras' ``TensorBoard`` callback with the ability to automatically upload metrics to Determined's +checkpoint storage. Determined's ``TensorBoard`` callback is configured identically to keras' +except it takes an additional ``core_context`` initial argument: .. code:: python - from determined import keras + tb_cb = det.keras.TensorBoard(core_context, ...) + +Then simply include it in your ``model.fit()`` as normal. - def keras_callbacks(self) -> List[tf.keras.callbacks.Callback]: - return [ - keras.callbacks.TensorBoard( - update_freq="batch", - profile_batch='5, 10', - histogram_freq=1, - ) - ] +************************* + Calling ``model.fit()`` +************************* -.. note:: +The only remaining step is to pass your callbacks to your ``model.fit()``: + +.. code:: python - Though specifying batches to profile with ``profile_batch`` is optional, profiling every batch - may cause a large amount of data to be uploaded to Tensorboard. This may result in long rendering - times for Tensorboard and memory issues. 
For long-running experiments, it is recommended to - configure profiling only on desired batches. + model.fit( + ..., + callbacks=[det_cb, tb_cb], + ) diff --git a/docs/model-dev-guide/create-experiment.rst b/docs/model-dev-guide/create-experiment.rst index 7c3018505766..bd62ac1b3200 100644 --- a/docs/model-dev-guide/create-experiment.rst +++ b/docs/model-dev-guide/create-experiment.rst @@ -172,6 +172,31 @@ Use the ``-h`` option to get the latest usage: python3 -m determined.launch.deepspeed -h +.. _launch-tensorflow: + +TensorFlow Launcher +=================== + +Format: + +``determined.launch.tensorflow [--] SCRIPT...`` + +This launcher configures a ``TF_CONFIG`` environment variable suitable for whichever level of +TensorFlow distributed training is appropriate for the available training resources +(``MultiWorkerMirroredStrategy``, ``MirroredStrategy``, or the default strategy). + +Example: + +.. code:: bash + + python3 -m determined.launch.tensorflow -- python3 ./my_train.py --my-arg=value + +Use the ``-h`` option to get the latest usage: + +.. code:: bash + + python3 -m determined.launch.tensorflow -h + Legacy Launcher =============== diff --git a/docs/model-dev-guide/debug-models.rst b/docs/model-dev-guide/debug-models.rst index 7594cdb08aa6..59473a6e0e05 100644 --- a/docs/model-dev-guide/debug-models.rst +++ b/docs/model-dev-guide/debug-models.rst @@ -70,9 +70,9 @@ for debugging. See :ref:`pytorch_trainer_ug` for usage details. #. Create simple tests to verify each ``Trial`` subclass method. Examples of what these tests might look like for - :class:`~determined.pytorch.deepspeed.DeepSpeedTrial` and :class:`~determined.keras.TFKerasTrial` - can be found in the :meth:`determined.TrialContext.from_config` documentation, but only you can - verify what is reasonable for your test. 
+ :class:`~determined.pytorch.deepspeed.DeepSpeedTrial` can be found in the + :meth:`determined.TrialContext.from_config` documentation, but only you can verify what is + reasonable for your test. #. Diagnose failures: @@ -385,8 +385,8 @@ step only applies if you have multiple GPUs and want to use distributed training consume too many resources and prevent the experiment from starting. - Determined is designed to control the details of distributed training for you. If you also try - to control those details, such as by calling ``tf.config.set_visible_devices()`` in a - :class:`~determined.keras.TFKerasTrial`, it is likely to cause issues. + to control those details, such as by calling ``tf.config.set_visible_devices()`` while + training a keras model, it is likely to cause issues. - Some classes of metrics must be specially calculated during distributed training. Most metrics, such as loss or accuracy, can be calculated piecemeal on each worker in a distributed diff --git a/docs/model-dev-guide/dtrain/reproducibility.rst b/docs/model-dev-guide/dtrain/reproducibility.rst index 6aad22e2d4b3..cc87e0fbd499 100644 --- a/docs/model-dev-guide/dtrain/reproducibility.rst +++ b/docs/model-dev-guide/dtrain/reproducibility.rst @@ -43,8 +43,8 @@ The experiment seed is used as a source of randomness for any hyperparameter sam The experiment seed is also used to generate a **trial seed** for every trial associated with the experiment. -In the ``Trial`` interface, the trial seed is accessible within the trial class using -``self.ctx.get_trial_seed()``. +When training on-cluster, the trial seed is accessible via +:class:`det.get_cluster_info().trial.trial_seed ` ******************* Coding Guidelines @@ -67,16 +67,12 @@ To achieve reproducible initial conditions in an experiment, please follow these ************************************** When doing CPU-only training with TensorFlow, it is possible to achieve floating-point -reproducibility throughout optimization. 
If using the :class:`~determined.keras.TFKerasTrial` API, -implement the optional :meth:`~determined.keras.TFKerasTrial.session_config` method to override the -default session configuration: +reproducibility throughout optimization: .. code:: python - def session_config(self) -> tf.ConfigProto: - return tf.ConfigProto( - intra_op_parallelism_threads=1, inter_op_parallelism_threads=1 - ) + tf.config.threading.set_intra_op_parallelism_threads(1) + tf.config.threading.set_inter_op_parallelism_threads(1) .. warning:: diff --git a/docs/model-dev-guide/hyperparameter/search-methods/hp-adaptive-asha.rst b/docs/model-dev-guide/hyperparameter/search-methods/hp-adaptive-asha.rst index 6cb870eacc8b..f447dd598505 100644 --- a/docs/model-dev-guide/hyperparameter/search-methods/hp-adaptive-asha.rst +++ b/docs/model-dev-guide/hyperparameter/search-methods/hp-adaptive-asha.rst @@ -21,12 +21,12 @@ Search mode: Resource budget: - ``time_metric``, ``max_time``: The name of the "time" metric and the maximum value it will take - for a trial that survives to the end of the experiment (see :ref:`Training Units - `). Note that the searcher will expect this metric to - appear in validation metrics reported by the model. This quantity is domain-specific and should - roughly reflect the number of minibatches the model must be trained on for it to converge on the - data set. For users who would like to determine this number experimentally, train a model with - reasonable hyperparameters using the ``single`` search method. + for a trial that survives to the end of the experiment (see :ref:`Training Units + `). Note that the searcher will expect this metric to + appear in validation metrics reported by the model. This quantity is domain-specific and should + roughly reflect the number of minibatches the model must be trained on for it to converge on the + data set. 
For users who would like to determine this number experimentally, train a model with + reasonable hyperparameters using the ``single`` search method. - ``max_trials``: This indicates the total number of hyperparameter settings that will be evaluated in the experiment. Set ``max_trials`` to at least 500 to take advantage of speedups from diff --git a/docs/model-dev-guide/profiling.rst b/docs/model-dev-guide/profiling.rst index 277c9adf719f..f59c2452c2a2 100644 --- a/docs/model-dev-guide/profiling.rst +++ b/docs/model-dev-guide/profiling.rst @@ -82,9 +82,9 @@ training code. Identifying inefficiencies in individual training operations or s fine-grained context than generic system metrics can provide. For this level of profiling, Determined supports integration with training profilers that are native to their frameworks: -- PyTorch Profiler (:ref:`PyTorch API `) -- DeepSpeed Profiler (:ref:`DeepSpeed API `) -- TensorFlow Keras Profiler (:ref:`Keras API `) +- :ref:`PyTorch Profiler ` +- :ref:`DeepSpeed Profiler ` +- :class:`Keras TensorBoard callback ` Please see your framework's profiler documentation and the Determined Training API guide for usage details. diff --git a/docs/reference/experiment-config-reference.rst b/docs/reference/experiment-config-reference.rst index 1dfd16c3b46e..f409cee59ea2 100644 --- a/docs/reference/experiment-config-reference.rst +++ b/docs/reference/experiment-config-reference.rst @@ -122,30 +122,31 @@ field is empty. Arbitrary Script ---------------- -Required. An arbitrary entrypoint script name. +Required. An arbitrary entrypoint script with args. Example: .. code:: yaml - entrypoint: ./hello.sh + entrypoint: ./hello.sh args... Preconfigured Launch Module with Script --------------------------------------- -Required. The name of a preconfigured launch module and script name. +Required. The name of a preconfigured launch module and script with args. Example: .. 
code:: yaml - entrypoint: python3 -m (LAUNCH_MODULE) train.py + entrypoint: python3 -m (LAUNCH_MODULE) train.py args... ``LAUNCH_MODULE`` options: - Horovod (determined.launch.horovod) - PyTorch (determined.launch.torch_distributed) - Deepspeed (determined.launch.deepspeed) +- TensorFlow (determined.launch.tensorflow) Preconfigured Launch Module with Legacy Trial Definition -------------------------------------------------------- @@ -304,7 +305,7 @@ Optional. Specifies the minimum frequency at which validation should be run for - :class:`~determined.pytorch.deepspeed.DeepSpeedTrial` and :class:`~determined.keras.TFKerasTrial`: If this is in the unit of epochs, - :ref:`records_per_epoch ` must be specified. + ``records_per_epoch`` must be specified. .. _experiment-config-perform-initial-validation: @@ -345,7 +346,7 @@ Optional. Specifies the minimum frequency for running checkpointing for each tri - :class:`~determined.pytorch.deepspeed.DeepSpeedTrial` and :class:`~determined.keras.TFKerasTrial`: If the unit is in epochs, you must also specify - :ref:`records_per_epoch `. + ``records_per_epoch``. ``checkpoint_policy`` ===================== diff --git a/docs/reference/training/api-core-reference.rst b/docs/reference/training/api-core-reference.rst index 5bb74671a432..49513066944b 100644 --- a/docs/reference/training/api-core-reference.rst +++ b/docs/reference/training/api-core-reference.rst @@ -99,10 +99,3 @@ ************************************* .. autoclass:: determined.core.TensorboardMode - -************************** - ``determined.TrialInfo`` -************************** - -.. autoclass:: determined.TrialInfo - :members: diff --git a/docs/reference/training/api-det-reference.rst b/docs/reference/training/api-det-reference.rst index 857d4bcaf1db..aa5c1e5d9d68 100644 --- a/docs/reference/training/api-det-reference.rst +++ b/docs/reference/training/api-det-reference.rst @@ -13,6 +13,13 @@ .. 
autoclass:: determined.ClusterInfo :members: +************************** + ``determined.TrialInfo`` +************************** + +.. autoclass:: determined.TrialInfo + :members: + ********************************* ``determined.import_from_path`` ********************************* diff --git a/docs/reference/training/api-keras-reference.rst b/docs/reference/training/api-keras-reference.rst index b24db8644e0f..1c656aca78ac 100644 --- a/docs/reference/training/api-keras-reference.rst +++ b/docs/reference/training/api-keras-reference.rst @@ -10,6 +10,25 @@ | :ref:`api-keras-ug` | +-------------------------------------------------+ +***************************************** + ``determined.keras.DeterminedCallback`` +***************************************** + +.. autoclass:: determined.keras.DeterminedCallback + :members: save_model, load_model + :member-order: bysource + :special-members: __init__ + +********************************** + ``determined.keras.TensorBoard`` +********************************** + +.. autoclass:: determined.keras.TensorBoard + +################# + Deprecated APIs +################# + *********************************** ``determined.keras.TFKerasTrial`` *********************************** diff --git a/docs/tools/tensorboard.rst b/docs/tools/tensorboard.rst index 7263b66f0a4c..538bf106a9ab 100644 --- a/docs/tools/tensorboard.rst +++ b/docs/tools/tensorboard.rst @@ -138,20 +138,8 @@ To configure TensorBoard for a specific framework, follow the examples below: TensorFlow Keras ================ -For models using :class:`~determined.keras.TFKerasTrial`, add a -:class:`determined.keras.callabacks.TensorBoard` callback to your trial class: - -.. code:: python - - from determined.keras import TFKerasTrial - from determined.keras.callbacks import TensorBoard - - - class MyModel(TFKerasTrial): - ... 
- - def keras_callbacks(self): - return [TensorBoard()] +For models using :class:`~determined.keras.DeterminedCallback`, include a +:class:`determined.keras.TensorBoard` callback in your ``model.fit()`` call. PyTorch ======= @@ -196,10 +184,9 @@ Any additional TFEvent files that are written to the appropriate path during tra to TensorBoard. The appropriate path varies by worker rank and can be obtained by one of the following functions: -- For CoreAPI users: :func:`~determined.core.TrainContext.get_tensorboard_path` +- For CoreAPI and keras users: :func:`~determined.core.TrainContext.get_tensorboard_path` - For PyTorchTrial users: :func:`~determined.pytorch.PyTorchTrialContext.get_tensorboard_path` - For DeepSpeedTrial users: :func:`~determined.pytorch.deepspeed.DeepSpeedTrialContext.get_tensorboard_path` -- For TFKerasTrial users: :func:`~determined.keras.TFKerasTrialContext.get_tensorboard_path` For more details and examples, refer to the :ref:`TensorBoard How-To Guide `. diff --git a/examples/computer_vision/iris_tf_keras/adaptive.yaml b/examples/computer_vision/iris_tf_keras/adaptive.yaml index 5c91087dac6c..92ecc09739e0 100644 --- a/examples/computer_vision/iris_tf_keras/adaptive.yaml +++ b/examples/computer_vision/iris_tf_keras/adaptive.yaml @@ -1,11 +1,8 @@ name: iris_tf_keras_adaptive_search -data: - train_url: http://download.tensorflow.org/data/iris_training.csv - test_url: http://download.tensorflow.org/data/iris_test.csv environment: - image: - cpu: determinedai/tensorflow-ngc-dev:0736b6d - gpu: determinedai/tensorflow-ngc-dev:0736b6d + image: junk +# cpu: determinedai/tensorflow-ngc-dev:0736b6d +# gpu: determinedai/tensorflow-ngc-dev:0736b6d hyperparameters: learning_rate: type: log @@ -25,7 +22,10 @@ searcher: name: adaptive_asha metric: val_categorical_accuracy smaller_is_better: false + # XXX fixme max_length: batches: 6400 max_trials: 512 -entrypoint: python3 -m determined.launch.horovod --autohorovod --trial model_def:IrisTrial +entrypoint: >- 
python3 -m determined.launch.tensorflow -- + python3 train.py diff --git a/examples/computer_vision/iris_tf_keras/const.yaml b/examples/computer_vision/iris_tf_keras/const.yaml index 595a754272c0..46f40fa0a0fe 100644 --- a/examples/computer_vision/iris_tf_keras/const.yaml +++ b/examples/computer_vision/iris_tf_keras/const.yaml @@ -1,11 +1,8 @@ name: iris_tf_keras_const -data: - train_url: http://download.tensorflow.org/data/iris_training.csv - test_url: http://download.tensorflow.org/data/iris_test.csv environment: - image: - cpu: determinedai/tensorflow-ngc-dev:0736b6d - gpu: determinedai/tensorflow-ngc-dev:0736b6d + image: junk +# cpu: determinedai/tensorflow-ngc-dev:0736b6d +# gpu: determinedai/tensorflow-ngc-dev:0736b6d hyperparameters: learning_rate: 1.0e-4 learning_rate_decay: 1.0e-6 @@ -15,6 +12,11 @@ searcher: name: single metric: val_categorical_accuracy smaller_is_better: false + # XXX max_length: batches: 5000 -entrypoint: python3 -m determined.launch.horovod --autohorovod --trial model_def:IrisTrial +entrypoint: >- + python3 -m determined.launch.tensorflow -- + python3 train.py +# XXX +max_restarts: 0 diff --git a/examples/computer_vision/iris_tf_keras/distributed.yaml b/examples/computer_vision/iris_tf_keras/distributed.yaml index 4dedcdec475e..4650538e86b0 100644 --- a/examples/computer_vision/iris_tf_keras/distributed.yaml +++ b/examples/computer_vision/iris_tf_keras/distributed.yaml @@ -1,11 +1,8 @@ name: iris_tf_keras_distributed -data: - train_url: http://download.tensorflow.org/data/iris_training.csv - test_url: http://download.tensorflow.org/data/iris_test.csv environment: - image: - cpu: determinedai/tensorflow-ngc-dev:0736b6d - gpu: determinedai/tensorflow-ngc-dev:0736b6d + image: junk +# cpu: determinedai/tensorflow-ngc-dev:0736b6d +# gpu: determinedai/tensorflow-ngc-dev:0736b6d hyperparameters: learning_rate: 1.0e-4 learning_rate_decay: 1.0e-6 @@ -17,6 +14,9 @@ searcher: name: single metric: val_categorical_accuracy smaller_is_better: false 
+ # XXX fixme max_length: batches: 2500 -entrypoint: python3 -m determined.launch.horovod --autohorovod --trial model_def:IrisTrial +entrypoint: >- + python3 -m determined.launch.tensorflow -- + python3 train.py diff --git a/examples/computer_vision/iris_tf_keras/model_def.py b/examples/computer_vision/iris_tf_keras/model_def.py deleted file mode 100644 index 1624a7705dec..000000000000 --- a/examples/computer_vision/iris_tf_keras/model_def.py +++ /dev/null @@ -1,105 +0,0 @@ -""" -This example shows how you could use Keras `Sequence`s and multiprocessing/multithreading for Keras -models in Determined. - -Useful References: - http://docs.determined.ai/latest/keras.html - https://keras.io/utils/ - -Based off of: https://medium.com/@nickbortolotti/iris-species-categorization-using-tf-keras-tf-data- - and-differences-between-eager-mode-on-and-off-9b4693e0b22 -""" -from typing import List - -import pandas as pd -import tensorflow as tf -from tensorflow.keras.layers import Dense, Input -from tensorflow.keras.losses import categorical_crossentropy -from tensorflow.keras.metrics import categorical_accuracy -from tensorflow.keras.models import Model -from tensorflow.keras.optimizers.legacy import RMSprop -from tensorflow.keras.utils import to_categorical - -from determined import keras - -# Constants about the data set. -NUM_CLASSES = 3 - -# The first row of each data set is not a typical CSV header with column labels, but rather a -# dataset descriptor of the following format: -# -# ,,,, -# -# The remaining rows then contain observations, with the four features followed by label. The -# label values in the observation rows take on the values 0, 1, or 2 which correspond to the -# three species in the header. Define the columns explicitly here so that we can more easily -# separate features and labels below. 
-LABEL_HEADER = "Species" -DS_COLUMNS = [ - "SepalLength", - "SepalWidth", - "PetalLength", - "PetalWidth", - LABEL_HEADER, -] - - -class IrisTrial(keras.TFKerasTrial): - def __init__(self, context: keras.TFKerasTrialContext) -> None: - self.context = context - - def build_model(self) -> Model: - """ - Define model for iris classification. - - This is a simple model with one hidden layer to predict iris species (setosa, versicolor, or - virginica) based on four input features (length and width of sepals and petals). - """ - inputs = Input(shape=(4,)) - dense1 = Dense(self.context.get_hparam("layer1_dense_size"))(inputs) - dense2 = Dense(NUM_CLASSES, activation="softmax")(dense1) - - # Wrap the model. - model = self.context.wrap_model(Model(inputs=inputs, outputs=dense2)) - - # Create and wrap the optimizer. - optimizer = RMSprop( - lr=self.context.get_hparam("learning_rate"), - decay=self.context.get_hparam("learning_rate_decay"), - ) - optimizer = self.context.wrap_optimizer(optimizer) - - model.compile( - optimizer, - categorical_crossentropy, - [categorical_accuracy], - ) - - return model - - def keras_callbacks(self) -> List[tf.keras.callbacks.Callback]: - return [keras.callbacks.TensorBoard(update_freq="batch", profile_batch=0, histogram_freq=1)] - - def build_training_data_loader(self) -> keras.InputData: - # Ignore header line and read the training CSV observations into a pandas DataFrame. - train = pd.read_csv(self.context.get_data_config()["train_url"], names=DS_COLUMNS, header=0) - train_features, train_labels = train, train.pop(LABEL_HEADER) - - # Since we're building a classifier, convert the labels in the raw - # dataset (0, 1, or 2) to one-hot vector encodings that we'll to - # construct the Sequence data loaders that Determined expects. 
- train_labels_categorical = to_categorical(train_labels, num_classes=3) - - return train_features.values, train_labels_categorical - - def build_validation_data_loader(self) -> keras.InputData: - # Ignore header line and read the test CSV observations into a pandas DataFrame. - test = pd.read_csv(self.context.get_data_config()["test_url"], names=DS_COLUMNS, header=0) - test_features, test_labels = test, test.pop(LABEL_HEADER) - - # Since we're building a classifier, convert the labels in the raw - # dataset (0, 1, or 2) to one-hot vector encodings that we'll to - # construct the Sequence data loaders that Determined expects. - test_labels_categorical = to_categorical(test_labels, num_classes=3) - - return test_features.values, test_labels_categorical diff --git a/examples/computer_vision/iris_tf_keras/train.py b/examples/computer_vision/iris_tf_keras/train.py new file mode 100644 index 000000000000..933276440796 --- /dev/null +++ b/examples/computer_vision/iris_tf_keras/train.py @@ -0,0 +1,138 @@ +""" +This example shows you how to train a model with Determined's keras callback. + +Useful References: + https://docs.determined.ai/latest/reference/training/api-keras-reference.html + https://keras.io/api/ + +Based off of: https://medium.com/@nickbortolotti/iris-species-categorization-using-tf-keras-tf-data- + and-differences-between-eager-mode-on-and-off-9b4693e0b22 +""" +import logging +from typing import List + +import pandas as pd +from tensorflow.keras import layers, losses, metrics, models, utils +from tensorflow.keras.optimizers import legacy + +import determined as det +import determined.keras + +# Where to download data from. +TRAIN_DATA = "http://download.tensorflow.org/data/iris_training.csv" +TEST_DATA = "http://download.tensorflow.org/data/iris_test.csv" + +# Constants about the data set. 
+NUM_CLASSES = 3
+
+# The first row of each data set is not a typical CSV header with column labels, but rather a
+# dataset descriptor of the following format:
+#
+#   <num observations>,<num features>,<species 0 label>,<species 1 label>,<species 2 label>
+#
+# The remaining rows then contain observations, with the four features followed by label. The
+# label values in the observation rows take on the values 0, 1, or 2 which correspond to the
+# three species in the header. Define the columns explicitly here so that we can more easily
+# separate features and labels below.
+LABEL_HEADER = "Species"
+DS_COLUMNS = [
+    "SepalLength",
+    "SepalWidth",
+    "PetalLength",
+    "PetalWidth",
+    LABEL_HEADER,
+]
+
+
+def get_train_data():
+    # Skip the descriptor row and read the training CSV observations into a pandas DataFrame.
+    train = pd.read_csv(TRAIN_DATA, names=DS_COLUMNS, header=0)
+    train_features, train_labels = train, train.pop(LABEL_HEADER)
+
+    # Since we're building a classifier, convert the labels in the raw
+    # dataset (0, 1, or 2) to the one-hot vector encodings that the
+    # categorical cross-entropy loss used in main() expects.
+    train_labels_categorical = utils.to_categorical(train_labels, num_classes=NUM_CLASSES)
+
+    return train_features.values, train_labels_categorical
+
+
+def get_test_data():
+    test = pd.read_csv(TEST_DATA, names=DS_COLUMNS, header=0)
+    test_features, test_labels = test, test.pop(LABEL_HEADER)
+    test_labels_categorical = utils.to_categorical(test_labels, num_classes=NUM_CLASSES)
+    return test_features.values, test_labels_categorical
+
+
+def main(core_context, strategy, checkpoint, continue_id, hparams):
+    # Download train and test data; the test split doubles as the validation set for fit().
+    train_x, train_y = get_train_data()
+    validation_data = get_test_data()
+
+    # Create and compile the model within a strategy's scope.
+    with strategy.scope():
+        inputs = layers.Input(shape=(4,))
+        dense1 = layers.Dense(hparams["layer1_dense_size"])(inputs)
+        dense2 = layers.Dense(NUM_CLASSES, activation="softmax")(dense1)
+        model = models.Model(inputs=inputs, outputs=dense2)
+
+        optimizer = legacy.RMSprop(
+            learning_rate=hparams["learning_rate"],
+            decay=hparams["learning_rate_decay"],
+        )
+
+        model.compile(
+            optimizer,
+            losses.categorical_crossentropy,
+            [metrics.categorical_accuracy],
+        )
+
+    # Create the main DeterminedCallback that connects training to the Determined cluster.
+    det_cb = det.keras.DeterminedCallback(
+        core_context,
+        checkpoint=checkpoint,
+        continue_id=continue_id,
+        # Iris epochs are very short, so we don't even bother to save checkpoints until we finish.
+        checkpoint_epochs=0,
+    )
+
+    # Also include a Determined-aware version of the Keras TensorBoard callback.
+    tb_cb = det.keras.TensorBoard(
+        core_context, update_freq="batch", profile_batch=0, histogram_freq=1
+    )
+
+    # Call model.fit() with our callbacks.
+    model.fit(
+        x=train_x,
+        y=train_y,
+        batch_size=hparams["global_batch_size"],
+        validation_data=validation_data,
+        epochs=100,
+        callbacks=[det_cb, tb_cb],
+    )
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO, format=det.LOG_FORMAT)
+
+    info = det.get_cluster_info()
+    if info and info.task_type == "TRIAL":
+        # We are training a trial on-cluster.
+        continue_id = info.trial.trial_id
+        checkpoint = info.latest_checkpoint
+        # Use the hparams selected by the searcher for this trial.
+        hparams = info.trial.hparams
+    else:
+        # We are either in a notebook on-cluster or off-cluster entirely.
+        continue_id = "local-train-task"
+        checkpoint = None
+        # Pick some hparams for ourselves.
+        hparams = {
+            "learning_rate": 1.0e-4,
+            "learning_rate_decay": 1.0e-6,
+            "layer1_dense_size": 16,
+            "global_batch_size": 16,
+        }
+
+    distributed, strategy = det.core.DistributedContext.from_tf_config()
+    with det.core.init(distributed=distributed) as core_context:
+        main(core_context, strategy, checkpoint, continue_id, hparams)