From a54c57cc4ab098f75be8ac085b3fab298d71bd3f Mon Sep 17 00:00:00 2001 From: Ryan Date: Tue, 15 Oct 2024 04:17:23 -0600 Subject: [PATCH] some keras cb docs and such Still TODO: - Deprecate everything TFKerasTrial related - Rip TFKerasTrial out of docs - Add Reference docs for DeterminedCallback - Add user guide or something for DeterminedCallback --- docs/model-dev-guide/create-experiment.rst | 23 +++ .../reference/experiment-config-reference.rst | 9 +- .../iris_tf_keras/adaptive.yaml | 14 +- .../computer_vision/iris_tf_keras/const.yaml | 16 +- .../iris_tf_keras/distributed.yaml | 14 +- .../iris_tf_keras/model_def.py | 105 ------------- .../computer_vision/iris_tf_keras/train.py | 138 ++++++++++++++++++ 7 files changed, 189 insertions(+), 130 deletions(-) delete mode 100644 examples/computer_vision/iris_tf_keras/model_def.py create mode 100644 examples/computer_vision/iris_tf_keras/train.py diff --git a/docs/model-dev-guide/create-experiment.rst b/docs/model-dev-guide/create-experiment.rst index 7c3018505766..7b57888e4e08 100644 --- a/docs/model-dev-guide/create-experiment.rst +++ b/docs/model-dev-guide/create-experiment.rst @@ -172,6 +172,29 @@ Use the ``-h`` option to get the latest usage: python3 -m determined.launch.deepspeed -h +TensorFlow Launcher +=================== + +Format: + +``determined.launch.tensorflow [--] SCRIPT...`` + +This launcher configures a ``TF_CONFIG`` environment variable suitable for whichever level of +TensorFlow distributed training is appropriate for the available training resources +(``MultiWorkerMirroredStrategy``, ``MirroredStrategy``, or the default strategy). + +Example: + +.. code:: bash + + python3 -m determined.launch.tensorflow -- python3 ./my_train.py --my-arg=value + +Use the ``-h`` option to get the latest usage: + +.. 
code:: bash + + python3 -m determined.launch.tensorflow -h + Legacy Launcher =============== diff --git a/docs/reference/experiment-config-reference.rst b/docs/reference/experiment-config-reference.rst index 1dfd16c3b46e..3c17d2c11a11 100644 --- a/docs/reference/experiment-config-reference.rst +++ b/docs/reference/experiment-config-reference.rst @@ -122,30 +122,31 @@ field is empty. Arbitrary Script ---------------- -Required. An arbitrary entrypoint script name. +Required. An arbitrary entrypoint script with args. Example: .. code:: yaml - entrypoint: ./hello.sh + entrypoint: ./hello.sh args... Preconfigured Launch Module with Script --------------------------------------- -Required. The name of a preconfigured launch module and script name. +Required. The name of a preconfigured launch module and script with args. Example: .. code:: yaml - entrypoint: python3 -m (LAUNCH_MODULE) train.py + entrypoint: python3 -m (LAUNCH_MODULE) train.py args... ``LAUNCH_MODULE`` options: - Horovod (determined.launch.horovod) - PyTorch (determined.launch.torch_distributed) - Deepspeed (determined.launch.deepspeed) +- TensorFlow (determined.launch.tensorflow) Preconfigured Launch Module with Legacy Trial Definition -------------------------------------------------------- diff --git a/examples/computer_vision/iris_tf_keras/adaptive.yaml b/examples/computer_vision/iris_tf_keras/adaptive.yaml index 5c91087dac6c..92ecc09739e0 100644 --- a/examples/computer_vision/iris_tf_keras/adaptive.yaml +++ b/examples/computer_vision/iris_tf_keras/adaptive.yaml @@ -1,11 +1,8 @@ name: iris_tf_keras_adaptive_search -data: - train_url: http://download.tensorflow.org/data/iris_training.csv - test_url: http://download.tensorflow.org/data/iris_test.csv environment: - image: - cpu: determinedai/tensorflow-ngc-dev:0736b6d - gpu: determinedai/tensorflow-ngc-dev:0736b6d + image: junk +# cpu: determinedai/tensorflow-ngc-dev:0736b6d +# gpu: determinedai/tensorflow-ngc-dev:0736b6d hyperparameters: learning_rate: 
type: log @@ -25,7 +22,10 @@ searcher: name: adaptive_asha metric: val_categorical_accuracy smaller_is_better: false + # XXX fixme max_length: batches: 6400 max_trials: 512 -entrypoint: python3 -m determined.launch.horovod --autohorovod --trial model_def:IrisTrial +entrypoint: >- + python3 -m determined.launch.tensorflow -- + python3 train.py diff --git a/examples/computer_vision/iris_tf_keras/const.yaml b/examples/computer_vision/iris_tf_keras/const.yaml index 595a754272c0..46f40fa0a0fe 100644 --- a/examples/computer_vision/iris_tf_keras/const.yaml +++ b/examples/computer_vision/iris_tf_keras/const.yaml @@ -1,11 +1,8 @@ name: iris_tf_keras_const -data: - train_url: http://download.tensorflow.org/data/iris_training.csv - test_url: http://download.tensorflow.org/data/iris_test.csv environment: - image: - cpu: determinedai/tensorflow-ngc-dev:0736b6d - gpu: determinedai/tensorflow-ngc-dev:0736b6d + image: junk +# cpu: determinedai/tensorflow-ngc-dev:0736b6d +# gpu: determinedai/tensorflow-ngc-dev:0736b6d hyperparameters: learning_rate: 1.0e-4 learning_rate_decay: 1.0e-6 @@ -15,6 +12,11 @@ searcher: name: single metric: val_categorical_accuracy smaller_is_better: false + # XXX max_length: batches: 5000 -entrypoint: python3 -m determined.launch.horovod --autohorovod --trial model_def:IrisTrial +entrypoint: >- + python3 -m determined.launch.tensorflow -- + python3 train.py +# XXX +max_restarts: 0 diff --git a/examples/computer_vision/iris_tf_keras/distributed.yaml b/examples/computer_vision/iris_tf_keras/distributed.yaml index 4dedcdec475e..4650538e86b0 100644 --- a/examples/computer_vision/iris_tf_keras/distributed.yaml +++ b/examples/computer_vision/iris_tf_keras/distributed.yaml @@ -1,11 +1,8 @@ name: iris_tf_keras_distributed -data: - train_url: http://download.tensorflow.org/data/iris_training.csv - test_url: http://download.tensorflow.org/data/iris_test.csv environment: - image: - cpu: determinedai/tensorflow-ngc-dev:0736b6d - gpu: 
determinedai/tensorflow-ngc-dev:0736b6d + image: junk +# cpu: determinedai/tensorflow-ngc-dev:0736b6d +# gpu: determinedai/tensorflow-ngc-dev:0736b6d hyperparameters: learning_rate: 1.0e-4 learning_rate_decay: 1.0e-6 @@ -17,6 +14,9 @@ searcher: name: single metric: val_categorical_accuracy smaller_is_better: false + # XXX fixme max_length: batches: 2500 -entrypoint: python3 -m determined.launch.horovod --autohorovod --trial model_def:IrisTrial +entrypoint: >- + python3 -m determined.launch.tensorflow -- + python3 train.py diff --git a/examples/computer_vision/iris_tf_keras/model_def.py b/examples/computer_vision/iris_tf_keras/model_def.py deleted file mode 100644 index 1624a7705dec..000000000000 --- a/examples/computer_vision/iris_tf_keras/model_def.py +++ /dev/null @@ -1,105 +0,0 @@ -""" -This example shows how you could use Keras `Sequence`s and multiprocessing/multithreading for Keras -models in Determined. - -Useful References: - http://docs.determined.ai/latest/keras.html - https://keras.io/utils/ - -Based off of: https://medium.com/@nickbortolotti/iris-species-categorization-using-tf-keras-tf-data- - and-differences-between-eager-mode-on-and-off-9b4693e0b22 -""" -from typing import List - -import pandas as pd -import tensorflow as tf -from tensorflow.keras.layers import Dense, Input -from tensorflow.keras.losses import categorical_crossentropy -from tensorflow.keras.metrics import categorical_accuracy -from tensorflow.keras.models import Model -from tensorflow.keras.optimizers.legacy import RMSprop -from tensorflow.keras.utils import to_categorical - -from determined import keras - -# Constants about the data set. -NUM_CLASSES = 3 - -# The first row of each data set is not a typical CSV header with column labels, but rather a -# dataset descriptor of the following format: -# -# ,,,, -# -# The remaining rows then contain observations, with the four features followed by label. 
The -# label values in the observation rows take on the values 0, 1, or 2 which correspond to the -# three species in the header. Define the columns explicitly here so that we can more easily -# separate features and labels below. -LABEL_HEADER = "Species" -DS_COLUMNS = [ - "SepalLength", - "SepalWidth", - "PetalLength", - "PetalWidth", - LABEL_HEADER, -] - - -class IrisTrial(keras.TFKerasTrial): - def __init__(self, context: keras.TFKerasTrialContext) -> None: - self.context = context - - def build_model(self) -> Model: - """ - Define model for iris classification. - - This is a simple model with one hidden layer to predict iris species (setosa, versicolor, or - virginica) based on four input features (length and width of sepals and petals). - """ - inputs = Input(shape=(4,)) - dense1 = Dense(self.context.get_hparam("layer1_dense_size"))(inputs) - dense2 = Dense(NUM_CLASSES, activation="softmax")(dense1) - - # Wrap the model. - model = self.context.wrap_model(Model(inputs=inputs, outputs=dense2)) - - # Create and wrap the optimizer. - optimizer = RMSprop( - lr=self.context.get_hparam("learning_rate"), - decay=self.context.get_hparam("learning_rate_decay"), - ) - optimizer = self.context.wrap_optimizer(optimizer) - - model.compile( - optimizer, - categorical_crossentropy, - [categorical_accuracy], - ) - - return model - - def keras_callbacks(self) -> List[tf.keras.callbacks.Callback]: - return [keras.callbacks.TensorBoard(update_freq="batch", profile_batch=0, histogram_freq=1)] - - def build_training_data_loader(self) -> keras.InputData: - # Ignore header line and read the training CSV observations into a pandas DataFrame. 
- train = pd.read_csv(self.context.get_data_config()["train_url"], names=DS_COLUMNS, header=0) - train_features, train_labels = train, train.pop(LABEL_HEADER) - - # Since we're building a classifier, convert the labels in the raw - # dataset (0, 1, or 2) to one-hot vector encodings that we'll to - # construct the Sequence data loaders that Determined expects. - train_labels_categorical = to_categorical(train_labels, num_classes=3) - - return train_features.values, train_labels_categorical - - def build_validation_data_loader(self) -> keras.InputData: - # Ignore header line and read the test CSV observations into a pandas DataFrame. - test = pd.read_csv(self.context.get_data_config()["test_url"], names=DS_COLUMNS, header=0) - test_features, test_labels = test, test.pop(LABEL_HEADER) - - # Since we're building a classifier, convert the labels in the raw - # dataset (0, 1, or 2) to one-hot vector encodings that we'll to - # construct the Sequence data loaders that Determined expects. - test_labels_categorical = to_categorical(test_labels, num_classes=3) - - return test_features.values, test_labels_categorical diff --git a/examples/computer_vision/iris_tf_keras/train.py b/examples/computer_vision/iris_tf_keras/train.py new file mode 100644 index 000000000000..595ad58c2fe5 --- /dev/null +++ b/examples/computer_vision/iris_tf_keras/train.py @@ -0,0 +1,138 @@ +""" +This example shows you how to train a model with Determined's keras callback. + +Useful References: + http://docs.determined.ai/latest/keras.html + https://keras.io/utils/ + +Based off of: https://medium.com/@nickbortolotti/iris-species-categorization-using-tf-keras-tf-data- + and-differences-between-eager-mode-on-and-off-9b4693e0b22 +""" +import logging +from typing import List + +import pandas as pd +from tensorflow.keras import layers, losses, metrics, models, utils +from tensorflow.keras.optimizers import legacy + +import determined as det +import determined.keras + +# Where to download data from. 
+TRAIN_DATA = "http://download.tensorflow.org/data/iris_training.csv" +TEST_DATA = "http://download.tensorflow.org/data/iris_test.csv" + +# Constants about the data set. +NUM_CLASSES = 3 + +# The first row of each data set is not a typical CSV header with column labels, but rather a +# dataset descriptor of the following format: +# +# <num observations>,<num features>,<species 0 label>,<species 1 label>,<species 2 label> +# +# The remaining rows then contain observations, with the four features followed by label. The +# label values in the observation rows take on the values 0, 1, or 2 which correspond to the +# three species in the header. Define the columns explicitly here so that we can more easily +# separate features and labels below. +LABEL_HEADER = "Species" +DS_COLUMNS = [ + "SepalLength", + "SepalWidth", + "PetalLength", + "PetalWidth", + LABEL_HEADER, +] + + +def get_train_data(): + # Ignore header line and read the training CSV observations into a pandas DataFrame. + train = pd.read_csv(TRAIN_DATA, names=DS_COLUMNS, header=0) + train_features, train_labels = train, train.pop(LABEL_HEADER) + + # Since we're building a classifier, convert the labels in the raw + # dataset (0, 1, or 2) to one-hot vector encodings that we'll pass to + # model.fit() as the training labels. + train_labels_categorical = utils.to_categorical(train_labels, num_classes=3) + + return train_features.values, train_labels_categorical + + +def get_test_data(): + test = pd.read_csv(TEST_DATA, names=DS_COLUMNS, header=0) + test_features, test_labels = test, test.pop(LABEL_HEADER) + test_labels_categorical = utils.to_categorical(test_labels, num_classes=3) + return test_features.values, test_labels_categorical + + +def main(core_context, strategy, checkpoint, continue_id, hparams): + # Download train and test data. + train_x, train_y = get_train_data() + validation_data = get_test_data() + + # Create and compile the model within a strategy's scope.
+ with strategy.scope(): + inputs = layers.Input(shape=(4,)) + dense1 = layers.Dense(hparams["layer1_dense_size"])(inputs) + dense2 = layers.Dense(NUM_CLASSES, activation="softmax")(dense1) + model = models.Model(inputs=inputs, outputs=dense2) + + optimizer = legacy.RMSprop( + lr=hparams["learning_rate"], + decay=hparams["learning_rate_decay"], + ) + + model.compile( + optimizer, + losses.categorical_crossentropy, + [metrics.categorical_accuracy], + ) + + # Create the main DeterminedCallback that connects training to the Determined cluster. + det_cb = det.keras.DeterminedCallback( + core_context, + checkpoint=checkpoint, + continue_id=continue_id, + # Iris epochs are very short, so we don't even bother to save checkpoints until we finish. + checkpoint_epochs=0, + ) + + # Also include a Determined-aware version of Keras' TensorBoard callback. + tb_cb = det.keras.TensorBoard( + core_context, update_freq="batch", profile_batch=0, histogram_freq=1 + ) + + # Call model.fit() with our callbacks. + model.fit( + x=train_x, + y=train_y, + batch_size=hparams["global_batch_size"], + validation_data=validation_data, + epochs=100, + callbacks=[det_cb, tb_cb], + ) + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format=det.LOG_FORMAT) + + info = det.get_cluster_info() + if info and info.task_type == "TRIAL": + # We are training a trial on-cluster. + continue_id = info.trial.trial_id + checkpoint = info.latest_checkpoint + # Use the hparams selected by the searcher for this trial. + hparams = info.trial.hparams + else: + # We are either in a notebook on-cluster or off-cluster entirely. + continue_id = "local-train-task" + checkpoint = None + # Pick some hparams for ourselves.
+ hparams = { + "learning_rate": 1.0e-4, + "learning_rate_decay": 1.0e-6, + "layer1_dense_size": 16, + "global_batch_size": 16, + } + + distributed, strategy = det.core.DistributedContext.from_tf_config() + with det.core.init(distributed=distributed) as core_context: + main(core_context, strategy, checkpoint, continue_id, hparams)