diff --git a/doc/source/_static/css/custom.css b/doc/source/_static/css/custom.css
index 5f62d82e16b7..8f0bfe15d567 100644
--- a/doc/source/_static/css/custom.css
+++ b/doc/source/_static/css/custom.css
@@ -464,7 +464,7 @@ footer {
   color: var(--pst-color-text-secondary);
 }
 
-/* override default colors used in the Sphnix theme */
+/* override default colors used in the Sphinx theme */
 :root {
   --tabs-color-label-active: #0475DE;
   --tabs-color-label-hover: #0475DE;
@@ -486,3 +486,8 @@ footer {
 :root {
   --blue: #0475DE;
 }
+
+/* Make the hover color of tag/gallery buttons differ from "active" */
+.tag.btn-outline-primary:hover {
+  background-color: rgba(20, 99, 208, 0.62) !important;
+}
\ No newline at end of file
diff --git a/doc/source/_static/js/tags.js b/doc/source/_static/js/tags.js
index dc271cb617ce..a627898d270b 100644
--- a/doc/source/_static/js/tags.js
+++ b/doc/source/_static/js/tags.js
@@ -23,22 +23,26 @@ window.addEventListener('load', () => {
     .then(panelTagMap => {
       for (const panelId in panelTagMap) {
-        let element = document.getElementsByClassName(panelId)[0];
-
-        // For each panel, attach data tags to the 4-th parent of the panel,
-        // which is the "div" element that we can later toggle.
-        // Sphinx Panels is too inflexible to allow us to attach data tags
-        // directly to the container.
-        for (let i = 0; i < 4; i++) {
-          if (element.parentNode) {
-            element = element.parentElement;
-            element.setAttribute('data-tags', panelTagMap[panelId]);
-          }
-          else {
-            console.log(panelId + ' has no parent element,' +
-              'please check if the panel has been tagged correctly.');
+        try { // panel might not be in _this_ gallery
+          let element = document.getElementsByClassName(panelId)[0];
+
+          // For each panel, attach data tags to the 4th parent of the panel,
+          // which is the "div" element that we can later toggle.
+          // Sphinx Panels is too inflexible to allow us to attach data tags
+          // directly to the container.
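+          // NOTE: the depth of 4 is tied to the DOM structure that
+          // sphinx-panels generates; update it if the theme or the
+          // sphinx-panels version changes.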
+ for (let i = 0; i < 4; i++) { + if (element.parentNode) { + element = element.parentElement; + element.setAttribute('data-tags', panelTagMap[panelId]); + } + else { + console.log(panelId + ' has no parent element,' + + 'please check if the panel has been tagged correctly.'); + } } } + // simply skip panels not in this gallery + catch(err) {} } const allButton = document.getElementById('allButton') diff --git a/doc/source/_toc.yml b/doc/source/_toc.yml index bbc60b28255d..f03c2a430fe7 100644 --- a/doc/source/_toc.yml +++ b/doc/source/_toc.yml @@ -74,15 +74,40 @@ parts: title: Ray Train sections: - file: train/getting-started + title: "Getting Started" - file: train/key-concepts + title: "Key Concepts" - file: train/user-guides + title: "User Guides" sections: - file: train/config_guide + title: "Configuring Ray Train" - file: train/dl_guide + title: "Deep Learning Guide" - file: train/gbdt + title: "XGBoost/LightGBM guide" - file: train/architecture - - file: train/faq + title: "Ray Train Architecture" - file: train/examples + title: "Examples" + sections: + - file: train/examples/pytorch/torch_fashion_mnist_example + title: "PyTorch Fashion MNIST Example" + - file: train/examples/transformers/transformers_example + title: "HF Transformers Example" + - file: train/examples/tf/tensorflow_mnist_example + title: "TensorFlow MNIST Example" + - file: train/examples/horovod/horovod_example + title: "Horovod Example" + - file: train/examples/mlflow_fashion_mnist_example + title: "MLflow Callback Example" + - file: train/examples/tf/tune_tensorflow_mnist_example + title: "Tune & TensorFlow Example" + - file: train/examples/pytorch/tune_cifar_torch_pbt_example + title: "Tune & PyTorch Example" + - file: train/examples/pytorch/torch_data_prefetch_benchmark/benchmark_example + title: "Torch Data Prefetching Benchmark" + - file: train/faq - file: train/api - file: tune/index diff --git a/doc/source/conf.py b/doc/source/conf.py index 20c8c89ddb7b..95750148649f 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -354,10 +354,19 @@ "xgboostExample": "Classification,XGBoost,Training,Preprocessing,Prediction", "timeSeriesAutoML": "Regression,Sklearn,Tuning", "AIRExamples": "Regression,Classification,Training,Tuning,Prediction," - "Preprocessing,Serving,PyTorch,TensorFlow,XGBoost,LightGBM,Sklearn" + "Preprocessing,Serving,PyTorch,TensorFlow,XGBoost,LightGBM,Sklearn", + # Tags for Ray Train examples gallery + "trainTorchFashionMnist": "PyTorch,Training", + "trainTransformers": "PyTorch,Training,HuggingFace", + "trainTensorflowMnist": "TensorFlow,Training", + "trainHorovod": "Horovod, PyTorch,Training", + "trainMlflow": "MLflow,Training", + "trainTuneTensorflow": "TensorFlow,Training,Tuning", + "trainTunePyTorch": "PyTorch,Training,Tuning", + "trainBenchmark": "PyTorch,Training" # TODO add and integrate tags for other libraries. # Tune has a proper example library - # Train, Serve, RLlib and AIR could use one. + # Serve, RLlib and AIR could use one. } # Create file with tag mappings for tags.js to use. diff --git a/doc/source/images/tf_logo.png b/doc/source/images/tf_logo.png new file mode 100644 index 000000000000..6bc2536aad09 Binary files /dev/null and b/doc/source/images/tf_logo.png differ diff --git a/doc/source/train/api.rst b/doc/source/train/api.rst index 424aedfd0acf..61ea64fefa31 100644 --- a/doc/source/train/api.rst +++ b/doc/source/train/api.rst @@ -2,6 +2,7 @@ Ray Train API ============= + This page covers framework specific integrations with Ray Train and Ray Train Developer APIs. 
For core Ray AIR APIs, take a look at the :ref:`AIR Trainer package reference `. @@ -14,12 +15,17 @@ Trainer and Predictor Integrations XGBoost ~~~~~~~ +``XGBoostTrainer`` +****************** + .. autoclass:: ray.train.xgboost.XGBoostTrainer :members: :show-inheritance: .. automethod:: __init__ +``XGBoostPredictor`` +******************** .. automodule:: ray.train.xgboost :members: @@ -29,12 +35,18 @@ XGBoost LightGBM ~~~~~~~~ +``LightGBMTrainer`` +******************* + .. autoclass:: ray.train.lightgbm.LightGBMTrainer :members: :show-inheritance: .. automethod:: __init__ +``LightGBMPredictor`` +********************* + .. automodule:: ray.train.lightgbm :members: @@ -44,12 +56,17 @@ LightGBM TensorFlow ~~~~~~~~~~ +``TensorflowTrainer`` +********************* + .. autoclass:: ray.train.tensorflow.TensorflowTrainer :members: :show-inheritance: .. automethod:: __init__ +``TensorflowPredictor`` and ``TensorflowCheckpoint`` +**************************************************** .. automodule:: ray.train.tensorflow :members: @@ -59,6 +76,9 @@ TensorFlow PyTorch ~~~~~~~ +``TorchTrainer`` +**************** + .. autoclass:: ray.train.torch.TorchTrainer :members: :show-inheritance: @@ -66,6 +86,9 @@ PyTorch .. automethod:: __init__ +``TorchPredictor`` +****************** + .. automodule:: ray.train.torch :members: :exclude-members: TorchTrainer @@ -74,12 +97,17 @@ PyTorch Horovod ~~~~~~~ +``HorovodTrainer`` +****************** + .. autoclass:: ray.train.horovod.HorovodTrainer :members: :show-inheritance: .. automethod:: __init__ +``HorovodConfig`` +***************** .. automodule:: ray.train.horovod :members: @@ -89,12 +117,17 @@ Horovod HuggingFace ~~~~~~~~~~~ +``HuggingFaceTrainer`` +********************** + .. autoclass:: ray.train.huggingface.HuggingFaceTrainer :members: :show-inheritance: .. automethod:: __init__ +``HuggingFacePredictor`` and ``HuggingFaceCheckpoint`` +****************************************************** .. automodule:: ray.train.huggingface :members: @@ -104,12 +137,17 @@ HuggingFace Scikit-Learn ~~~~~~~~~~~~ +``SklearnTrainer`` +****************** + .. autoclass:: ray.train.sklearn.SklearnTrainer :members: :show-inheritance: .. automethod:: __init__ +``SklearnPredictor`` and ``SklearnCheckpoint`` +********************************************** .. automodule:: ray.train.sklearn :members: @@ -119,6 +157,9 @@ Scikit-Learn Mosaic ~~~~~~ +``MosaicTrainer`` +***************** + .. autoclass:: ray.train.mosaic.MosaicTrainer :members: :show-inheritance: @@ -132,16 +173,32 @@ Mosaic :show-inheritance: -Reinforcement Learning (RLlib) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Reinforcement Learning with RLlib +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +``RLTrainer`` +************* + +.. autoclass:: ray.train.rl.RLTrainer + :members: + :show-inheritance: + + .. automethod:: __init__ + +``RLPredictor`` and ``RLCheckpoint`` +************************************ .. automodule:: ray.train.rl :members: + :exclude-members: RLTrainer :show-inheritance: Base Classes (Developer APIs) ----------------------------- + +.. _train-base-trainer: + .. autoclass:: ray.train.trainer.BaseTrainer :members: :noindex: @@ -165,8 +222,12 @@ Base Classes (Developer APIs) .. automethod:: __init__ :noindex: +.. _train-backend: + .. autoclass:: ray.train.backend.Backend :members: +.. _train-backend-config: + .. 
autoclass:: ray.train.backend.BackendConfig
    :members:

diff --git a/doc/source/train/architecture.rst b/doc/source/train/architecture.rst
index 2c44247fe843..733f86c6cfbe 100644
--- a/doc/source/train/architecture.rst
+++ b/doc/source/train/architecture.rst
@@ -1,44 +1,66 @@
 .. _train-arch:
 
+.. TODO: the diagram and some of the components (in the given context) are outdated.
+   Make sure to fix this.
+
 Ray Train Architecture
 ======================
 
-A diagram of the Ray Train architecture is provided below.
+Training models with Ray Train involves several components.
+First, depending on the training framework you want to work with, you
+provide a ``Trainer`` that manages the training process.
+For instance, to train a PyTorch model, you use a ``TorchTrainer``.
+The actual training work is distributed among the workers of a ``WorkerGroup``
+on a cluster.
+Each framework has its own communication protocols and exchange formats,
+which is why Ray Train provides ``Backend`` implementations (e.g. ``TorchBackend``)
+that are used to run the training process through a ``BackendExecutor``.
+
+Here's a visual overview of the architecture components of Ray Train:
 
 .. image:: train-arch.svg
     :width: 70%
     :align: center
 
+Below we discuss each component in a bit more detail.
 
 Trainer
 -------
 
-The Trainer is the main class that is exposed in the Ray Train API that users will interact with.
+Trainers are your main entry point to the Ray Train API.
+Train provides a :ref:`BaseTrainer <train-base-trainer>`, from which the
+intermediate ``DataParallelTrainer`` (for TensorFlow and Torch) and
+``GBDTTrainer`` (for XGBoost and LightGBM) are derived, and from which the
+framework-specific Trainers in turn inherit.
+Defining an actual Trainer, such as a ``TorchTrainer``, works as follows:
 
-* The user will pass in a *function* which defines the training logic.
+* You pass a *function* to the Trainer which defines the training logic.
 * The Trainer will create an :ref:`Executor <train-arch-executor>` to run the distributed training.
-* The Trainer will handle callbacks based on the results from the BackendExecutor.
+* The Trainer will handle callbacks based on the results from the executor.
 
-.. _train-arch-executor:
+.. _train-arch-backend:
 
-Executor
---------
+Backend
+-------
 
-The executor is an interface which handles execution of distributed training.
+Backends are used to initialize and manage framework-specific communication protocols.
+Each training library (Torch, Horovod, TensorFlow, etc.) has a separate backend
+and takes specific configuration values defined in a :ref:`BackendConfig <train-backend-config>`.
+Each backend comes with a ``BackendExecutor`` that is used to run the training process.
 
-* The executor will handle the creation of an actor group and will be initialized in conjunction with a backend.
-* Worker resources, number of workers, and placement strategy will be passed to the Worker Group.
+.. _train-arch-executor:
 
+Executor
+--------
 
-Backend
--------
+The executor is an interface (``BackendExecutor``) that executes distributed training.
+It handles the creation of a group of workers (using :ref:`Ray Actors <actor-guide>`)
+and is initialized with a :ref:`backend <train-arch-backend>`.
+The executor passes all required resources, the number of workers, and information about
+worker placement to the ``WorkerGroup``.
 
-A backend is used in conjunction with the executor to initialize and manage framework-specific communication protocols.
-Each communication library (Torch, Horovod, TensorFlow, etc.) will have a separate backend and will take a specific configuration value.
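+
+As a rough sketch, here is how these pieces fit together from the user's
+perspective (assuming a ``train_func`` that you define yourself):
+
+.. code-block:: python
+
+    from ray.air.config import ScalingConfig
+    from ray.train.torch import TorchTrainer
+
+    # The TorchTrainer uses a Torch-specific Backend; its BackendExecutor
+    # runs train_func on every worker in the WorkerGroup.
+    trainer = TorchTrainer(
+        train_loop_per_worker=train_func,
+        scaling_config=ScalingConfig(num_workers=2),
+    )
+    result = trainer.fit()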
WorkerGroup ----------- The WorkerGroup is a generic utility class for managing a group of Ray Actors. - -* This is similar in concept to Fiber's `Ring `_. +This is similar in concept to Fiber's `Ring `_. diff --git a/doc/source/train/config_guide.rst b/doc/source/train/config_guide.rst index d152e958beb6..0b6c0381ddcb 100644 --- a/doc/source/train/config_guide.rst +++ b/doc/source/train/config_guide.rst @@ -1,13 +1,13 @@ .. _train-config: -Configurations User Guide -========================= +Ray Train Configuration User Guide +================================== The following overviews how to configure scale-out, run options, and fault-tolerance for Train. For more details on how to configure data ingest, also refer to :ref:`air-ingest`. -Scaling configuration (``ScalingConfig``) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Scaling Configurations in Train (``ScalingConfig``) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The scaling configuration specifies distributed training properties like the number of workers or the resources per worker. @@ -22,8 +22,8 @@ The properties of the scaling configuration are :ref:`tunable `. :start-after: __failure_config_start__ :end-before: __failure_config_end__ -Sync configuration (``SyncConfig``) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Sync configurations in Train (``SyncConfig``) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The sync configuration specifies how to synchronize checkpoints between the Ray cluster and remote storage. @@ -69,8 +69,8 @@ are :ref:`not tunable `. :end-before: __sync_config_end__ -Checkpoint configuration (``CheckpointConfig``) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Checkpoint configurations in Train (``CheckpointConfig``) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The checkpoint configuration specifies how often to checkpoint training state and how many checkpoints to keep. diff --git a/doc/source/train/dl_guide.rst b/doc/source/train/dl_guide.rst index 9f0e208a543f..3fa2e6d812a3 100644 --- a/doc/source/train/dl_guide.rst +++ b/doc/source/train/dl_guide.rst @@ -1,7 +1,7 @@ .. _train-dl-guide: -Deep Learning User Guide -======================== +Distributed Deep Learning with Ray Train User Guide +=================================================== This guide explains how to use Train to scale PyTorch, TensorFlow and Horovod. @@ -16,8 +16,8 @@ In this guide, we cover examples for the following use cases: .. _train-backends: -Backends --------- +Using Deep Learning Frameworks as Backends +------------------------------------------ Ray Train provides a thin API around different backend frameworks for distributed deep learning. At the moment, Ray Train allows you to perform @@ -38,15 +38,15 @@ training with: .. _train-porting-code: -Porting code to Ray Train -------------------------- +Porting code from PyTorch, TensorFlow, or Horovod to Ray Train +-------------------------------------------------------------- The following instructions assume you have a training function that can already be run on a single worker for one of the supported :ref:`backend ` frameworks. -Update training function -~~~~~~~~~~~~~~~~~~~~~~~~ +Updating your training function +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ First, you'll want to update your training function to support distributed training. @@ -173,8 +173,8 @@ training. To onboard onto Horovod, please visit the `Horovod guide `_. 
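+
+For example, a PyTorch training function usually just adds Ray Train's
+preparation utilities to an existing loop. Here is a sketch, in which
+``build_model_and_data`` and ``num_epochs`` stand in for your own code:
+
+.. code-block:: python
+
+    from ray import train
+
+    def train_func():
+        model, data_loader = build_model_and_data()  # your usual setup
+        # prepare_model wraps the model in DistributedDataParallel and
+        # moves it to the right device.
+        model = train.torch.prepare_model(model)
+        # prepare_data_loader adds a DistributedSampler and device transfer.
+        data_loader = train.torch.prepare_data_loader(data_loader)
+        for epoch in range(num_epochs):
+            ...  # your training loop, unchanged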
-Create Ray Train Trainer -~~~~~~~~~~~~~~~~~~~~~~~~ +Creating a Ray Train Trainer +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ``Trainer``\s are the primary Ray Train classes that are used to manage state and execute training. You can create a simple ``Trainer`` for the backend of choice @@ -273,8 +273,8 @@ To customize the backend setup, you can use the :ref:`framework-specific config For more configurability, please reference the :py:class:`~ray.train.data_parallel_trainer.DataParallelTrainer` API. -Run training function -~~~~~~~~~~~~~~~~~~~~~ +Running your training function +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ With a distributed training function and a Ray Train ``Trainer``, you are now ready to start training! @@ -407,8 +407,8 @@ of the :py:class:`~ray.air.result.Result` object returned by ``Trainer.fit()``. .. _train-datasets: -Distributed Data Ingest with Ray Datasets ------------------------------------------ +Distributed Data Ingest with Ray Datasets and Ray Train +------------------------------------------------------- :ref:`Ray Datasets ` are the recommended way to work with large datasets in Ray Train. Datasets provides automatic loading, sharding, and pipelined ingest (optional) of Data across multiple Train workers. To get started, pass in one or more datasets under the ``datasets`` keyword argument for Trainer (e.g., ``Trainer(datasets={...})``). @@ -444,8 +444,8 @@ For more details on how to configure data ingest for Train, please refer to :ref .. _train-monitoring: -Logging, Checkpointing and Callbacks ------------------------------------- +Logging, Checkpointing and Callbacks in Ray Train +------------------------------------------------- Ray Train has mechanisms to easily collect intermediate results from the training workers during the training run and also has a :ref:`Callback interface ` to perform actions on these intermediate results (such as logging, aggregations, etc.). diff --git a/doc/source/train/examples.rst b/doc/source/train/examples.rst index 12e482c12272..40200eb5821e 100644 --- a/doc/source/train/examples.rst +++ b/doc/source/train/examples.rst @@ -7,47 +7,115 @@ Ray Train Examples .py files in ray/python/ray/train/examples. Below are examples for using Ray Train with a variety of models, frameworks, -and use cases. +and use cases. You can filter these examples by the following categories: -General Examples ----------------- -PyTorch -~~~~~~~ +.. raw:: html -* :doc:`/train/examples/pytorch/torch_fashion_mnist_example`: - End-to-end example for PyTorch. +
+    <div class="tag-filter-buttons">
+        <div class="tag btn btn-primary" id="allButton">All</div>

-* :doc:`/train/examples/transformers/transformers_example`:
-  End-to-end example for HuggingFace Transformers (PyTorch).
+
+        <!-- Framework tags -->
+        <div class="tag btn btn-outline-primary">PyTorch</div>
+        <div class="tag btn btn-outline-primary">TensorFlow</div>
+        <div class="tag btn btn-outline-primary">HuggingFace</div>
+        <div class="tag btn btn-outline-primary">Horovod</div>
+        <div class="tag btn btn-outline-primary">MLflow</div>

-TensorFlow
-~~~~~~~~~~
+
+        <!-- Use case tags -->
+        <div class="tag btn btn-outline-primary">Training</div>
+        <div class="tag btn btn-outline-primary">Tuning</div>
+    </div>
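+
+.. The button labels above must match the tags assigned to each example
+   panel in ``conf.py`` (e.g. ``"trainTorchFashionMnist": "PyTorch,Training"``);
+   ``tags.js`` attaches those tags to the panels as ``data-tags`` attributes
+   and toggles panel visibility on click.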
-* :doc:`/train/examples/tf/tensorflow_mnist_example`: - End-to-end example for TensorFlow -Horovod -~~~~~~~ +Distributed Training Examples using Ray Train +--------------------------------------------- -* :doc:`/train/examples/horovod/horovod_example`: - End-to-end example for Horovod (with PyTorch) +.. panels:: + :container: container pb-4 full-width + :column: col-md-4 px-2 py-2 + :img-top-cls: pt-5 w-75 d-block mx-auto + --- + :img-top: /images/pytorch_logo.png -Logger/Callback Examples ------------------------- -* :doc:`/train/examples/mlflow_fashion_mnist_example`: - Example for logging training to MLflow via the ``MLflowLoggerCallback`` + +++ + .. link-button:: torch_fashion_mnist_ex + :type: ref + :text: PyTorch Fashion MNIST Training Example + :classes: btn-link btn-block stretched-link trainTorchFashionMnist + --- + :img-top: /images/hugging.png -Ray Tune Integration Examples ------------------------------ + +++ + .. link-button:: train_transformers_example + :type: ref + :text: Transformers with PyTorch Training Example + :classes: btn-link btn-block stretched-link trainTransformers -* :doc:`/train/examples/tf/tune_tensorflow_mnist_example`: - End-to-end example for tuning a TensorFlow model. + --- + :img-top: /images/tf_logo.png -* :doc:`/train/examples/pytorch/tune_cifar_torch_pbt_example`: - End-to-end example for tuning a PyTorch model with PBT. + +++ + .. link-button:: tensorflow_mnist_example + :type: ref + :text: TensorFlow MNIST Training Example + :classes: btn-link btn-block stretched-link trainTensorflowMnist + + --- + :img-top: /images/horovod.png + + +++ + .. link-button:: horovod_example + :type: ref + :text: End-to-end Horovod Training Example + :classes: btn-link btn-block stretched-link trainHorovod + + +Ray Train Examples Using Loggers & Callbacks +-------------------------------------------- + +.. panels:: + :container: container pb-4 full-width + :column: col-md-4 px-2 py-2 + :img-top-cls: pt-5 w-75 d-block mx-auto + + --- + :img-top: /images/mlflow.png + + +++ + .. link-button:: train_mlflow_example + :type: ref + :text: Logging Training Runs with MLflow + :classes: btn-link btn-block stretched-link trainMlflow + + +Ray Train & Tune Integration Examples +------------------------------------- + +.. panels:: + :container: container pb-4 full-width + :column: col-md-4 px-2 py-2 + :img-top-cls: pt-5 w-75 d-block mx-auto + + --- + :img-top: /images/tune.png + + +++ + .. link-button:: tune_train_tf_example + :type: ref + :text: End-to-end Example for Tuning a TensorFlow Model + :classes: btn-link btn-block stretched-link trainTuneTensorflow + + --- + :img-top: /images/tune.png + + +++ + .. link-button:: tune_train_torch_example + :type: ref + :text: End-to-end Example for Tuning a PyTorch Model with PBT + :classes: btn-link btn-block stretched-link trainTunePyTorch .. TODO implement these examples! @@ -63,9 +131,19 @@ Ray Tune Integration Examples * Example training on Vision model. -Benchmarks ----------- +Ray Train Benchmarks +-------------------- + +.. panels:: + :container: container pb-4 full-width + :column: col-md-4 px-2 py-2 + :img-top-cls: pt-5 w-75 d-block mx-auto -* :doc:`/train/examples/pytorch/torch_data_prefetch_benchmark/benchmark_example`: - Benchmark example for the PyTorch data transfer auto pipeline. + --- + :img-top: /ray-overview/images/ray_svg_logo.svg + +++ + .. 
link-button:: train_benchmark + :type: ref + :text: Benchmark example for the PyTorch data transfer auto pipeline + :classes: btn-link btn-block stretched-link trainBenchmark diff --git a/doc/source/train/examples/horovod/horovod_example.rst b/doc/source/train/examples/horovod/horovod_example.rst index 42830d38ecc2..0593a275be09 100644 --- a/doc/source/train/examples/horovod/horovod_example.rst +++ b/doc/source/train/examples/horovod/horovod_example.rst @@ -1,6 +1,8 @@ :orphan: -horovod_example -=============== +.. _horovod_example: + +Horovod Distributed Training Example with PyTorch & Ray Train +============================================================= .. literalinclude:: /../../python/ray/train/examples/horovod/horovod_example.py diff --git a/doc/source/train/examples/mlflow_fashion_mnist_example.rst b/doc/source/train/examples/mlflow_fashion_mnist_example.rst index 5f04779badbd..fb3d6106d36c 100644 --- a/doc/source/train/examples/mlflow_fashion_mnist_example.rst +++ b/doc/source/train/examples/mlflow_fashion_mnist_example.rst @@ -1,6 +1,8 @@ :orphan: -mlflow_fashion_mnist_example -============================ +.. _train_mlflow_example: + +Logging Ray Train Training Runs with MLflow (using Callbacks) +============================================================= .. literalinclude:: /../../python/ray/train/examples/mlflow_fashion_mnist_example.py diff --git a/doc/source/train/examples/pytorch/torch_data_prefetch_benchmark/benchmark_example.rst b/doc/source/train/examples/pytorch/torch_data_prefetch_benchmark/benchmark_example.rst index 5237191880ba..4c923e9a94cb 100644 --- a/doc/source/train/examples/pytorch/torch_data_prefetch_benchmark/benchmark_example.rst +++ b/doc/source/train/examples/pytorch/torch_data_prefetch_benchmark/benchmark_example.rst @@ -1,7 +1,9 @@ :orphan: -Torch Data Prefetching Benchmark -================================ +.. _train_benchmark: + +Torch Data Prefetching Benchmark for Ray Train +============================================== We provide a benchmark example to show how the auto pipeline for host to device data transfer speeds up training on GPUs. This functionality can be easily enabled by setting ``auto_transfer=True`` in :func:`train.torch.prepare_data_loader`. diff --git a/doc/source/train/examples/pytorch/torch_fashion_mnist_example.rst b/doc/source/train/examples/pytorch/torch_fashion_mnist_example.rst index 5b5e0d47d480..2955441efaf0 100644 --- a/doc/source/train/examples/pytorch/torch_fashion_mnist_example.rst +++ b/doc/source/train/examples/pytorch/torch_fashion_mnist_example.rst @@ -1,6 +1,8 @@ :orphan: -torch_fashion_mnist_example -=========================== +.. _torch_fashion_mnist_ex: + +Running Distributed Training of a PyTorch Model on Fashion MNIST with Ray Train +=============================================================================== .. 
literalinclude:: /../../python/ray/train/examples/pytorch/torch_fashion_mnist_example.py diff --git a/doc/source/train/examples/pytorch/torch_regression_example.rst b/doc/source/train/examples/pytorch/torch_regression_example.rst index 133f091c1f0a..7ddad6fe21c7 100644 --- a/doc/source/train/examples/pytorch/torch_regression_example.rst +++ b/doc/source/train/examples/pytorch/torch_regression_example.rst @@ -1,3 +1,4 @@ + :orphan: torch_regression_example diff --git a/doc/source/train/examples/pytorch/tune_cifar_torch_pbt_example.rst b/doc/source/train/examples/pytorch/tune_cifar_torch_pbt_example.rst index 6d9c1eaf3abe..a7e9d54e2086 100644 --- a/doc/source/train/examples/pytorch/tune_cifar_torch_pbt_example.rst +++ b/doc/source/train/examples/pytorch/tune_cifar_torch_pbt_example.rst @@ -1,6 +1,8 @@ :orphan: -tune_cifar_torch_pbt_example -============================ +.. _tune_train_torch_example: + +Tuning Hyperparameters of a Distributed PyTorch Model with PBT using Ray Train & Tune +===================================================================================== .. literalinclude:: /../../python/ray/train/examples/pytorch/tune_cifar_torch_pbt_example.py diff --git a/doc/source/train/examples/tf/tensorflow_mnist_example.rst b/doc/source/train/examples/tf/tensorflow_mnist_example.rst index a63bca284a75..0a03a9462d76 100644 --- a/doc/source/train/examples/tf/tensorflow_mnist_example.rst +++ b/doc/source/train/examples/tf/tensorflow_mnist_example.rst @@ -1,6 +1,8 @@ :orphan: -tensorflow_mnist_example -======================== +.. _tensorflow_mnist_example: + +Running Distributed Training of a TensorFlow Model on MNIST with Ray Train +========================================================================== .. literalinclude:: /../../python/ray/train/examples/tf/tensorflow_mnist_example.py diff --git a/doc/source/train/examples/tf/tune_tensorflow_mnist_example.rst b/doc/source/train/examples/tf/tune_tensorflow_mnist_example.rst index efc7db7cdc22..724710861565 100644 --- a/doc/source/train/examples/tf/tune_tensorflow_mnist_example.rst +++ b/doc/source/train/examples/tf/tune_tensorflow_mnist_example.rst @@ -1,6 +1,8 @@ :orphan: -tune_tensorflow_mnist_example -============================= +.. _tune_train_tf_example: + +Tuning Hyperparameters of a Distributed TensorFlow Model using Ray Train & Tune +=============================================================================== .. literalinclude:: /../../python/ray/train/examples/tf/tune_tensorflow_mnist_example.py diff --git a/doc/source/train/examples/transformers/transformers_example.rst b/doc/source/train/examples/transformers/transformers_example.rst index 6325b1c4f088..7f7eeb4547fc 100644 --- a/doc/source/train/examples/transformers/transformers_example.rst +++ b/doc/source/train/examples/transformers/transformers_example.rst @@ -1,6 +1,8 @@ :orphan: -transformers_example -==================== +.. _train_transformers_example : + +Ray Train Example for HuggingFace Transformers with PyTorch +=========================================================== .. literalinclude:: /../../python/ray/train/examples/transformers/transformers_example.py diff --git a/doc/source/train/faq.rst b/doc/source/train/faq.rst index 94cfe9f6c799..76534e86f16d 100644 --- a/doc/source/train/faq.rst +++ b/doc/source/train/faq.rst @@ -14,8 +14,8 @@ distributed training communication is done with Torch's ``DistributedDataParalle Take a look at the :ref:`Pytorch ` and :ref:`Tensorflow ` benchmarks to check performance parity. -How do I set resources? 
------------------------ +How do I set training resources in Ray Train? +--------------------------------------------- By default, each worker will reserve 1 CPU resource, and an additional 1 GPU resource if ``use_gpu=True``. @@ -28,8 +28,9 @@ you can initialize the ``Trainer`` with ``resources_per_worker`` specified in `` can still be run with Ray Train today without these functions. -My multi-node PyTorch GPU training is hanging or giving me obscure NCCL errors. What do I do? ----------------------------------------------------------------------------------------------- +My multi-node PyTorch GPU training is hanging or giving me obscure NCCL errors. What do I do? +--------------------------------------------------------------------------------------------- + If you are on a multi-node GPU training setup and training is hanging, or you get errors like `RuntimeError: NCCL error in: /pytorch/torch/lib/c10d/ProcessGroupNCCL.cpp:911, unhandled system error` it could be that there is some networking misconfiguration in your cluster. diff --git a/doc/source/train/gbdt.rst b/doc/source/train/gbdt.rst index 8855050624b3..a4f1aec6cb68 100644 --- a/doc/source/train/gbdt.rst +++ b/doc/source/train/gbdt.rst @@ -1,12 +1,12 @@ .. _train-gbdt-guide: -XGBoost / LightGBM User Guide -============================= +XGBoost & LightGBM User Guide for Ray Train +=========================================== Ray Train has built-in support for XGBoost and LightGBM. -Basic Usage ------------ +Basic Training with Tree-Based Models in Train +---------------------------------------------- Just as in the original `xgboost.train() `__ and `lightgbm.train() `__ functions, the diff --git a/doc/source/train/getting-started.rst b/doc/source/train/getting-started.rst index 5a5b483fa506..cb59e5f64c9d 100644 --- a/doc/source/train/getting-started.rst +++ b/doc/source/train/getting-started.rst @@ -1,7 +1,7 @@ .. _train-getting-started: -Getting Started -=============== +Getting Started with Distributed Model Training in Ray Train +============================================================ Ray Train offers multiple ``Trainers`` which implement scalable model training for different machine learning frameworks. Here are examples for some of the commonly used trainers: diff --git a/doc/source/train/key-concepts.rst b/doc/source/train/key-concepts.rst index bec0bed8460f..c477099d2104 100644 --- a/doc/source/train/key-concepts.rst +++ b/doc/source/train/key-concepts.rst @@ -1,7 +1,7 @@ .. _train-key-concepts: -Key Concepts -============ +Key Concepts of Ray Train +========================= There are four main concepts in the Ray Train library. @@ -22,6 +22,10 @@ The output of a Trainer run is a :ref:`Result ` that metrics from the training run and the latest saved :ref:`Checkpoint `. Trainers can also be configured with :ref:`Datasets ` and :ref:`Preprocessors ` for scalable data ingest and preprocessing. + +Deep Learning, Tree-Based, and other Trainers +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + There are three categories of built-in Trainers: .. tabbed:: Deep Learning Trainers @@ -65,8 +69,8 @@ There are three categories of built-in Trainers: .. _train-key-concepts-config: -Configuration -------------- +Train Configuration +------------------- Trainers are configured with configuration objects. There are two main configuration classes, the :class:`ScalingConfig ` and the :class:`RunConfig `. @@ -77,8 +81,8 @@ Check out the :ref:`Configurations User Guide ` for an in-depth gu .. 
_train-key-concepts-results: -Checkpoints ------------ +Train Checkpoints +----------------- Calling ``Trainer.fit()`` returns a :class:`Result ` object, which includes information about the run such as the reported metrics and the saved checkpoints. @@ -91,8 +95,8 @@ Checkpoints have the following purposes: .. _train-key-concepts-predictors: -Predictors ----------- +Train Predictors +---------------- Predictors are the counterpart to Trainers. A Trainer trains a model on a dataset, and a predictor uses the resulting model and performs inference on it. diff --git a/doc/source/train/train.rst b/doc/source/train/train.rst index 5d9523b965c1..c79ab6ea278a 100644 --- a/doc/source/train/train.rst +++ b/doc/source/train/train.rst @@ -42,8 +42,8 @@ There are three broad categories of Trainers that Train offers: * Leverage the :ref:`Ray cluster launcher ` to launch autoscaling or spot instance clusters on any cloud. -Quick Start ------------ +Quick Start to Distributed Training with Ray Train +-------------------------------------------------- .. tabbed:: XGBoost @@ -79,10 +79,11 @@ Quick Start .. _train-framework-catalog: -Framework Catalog ------------------ +Training Framework Catalog +-------------------------- -Here is a catalog of the framework-specific Trainer, Checkpoint, and Predictor classes that ship out of the box with Train: +Here is a catalog of the framework-specific Trainer, Checkpoint, and Predictor +classes that ship out of the box with Train: .. list-table:: diff --git a/doc/source/train/user-guides.rst b/doc/source/train/user-guides.rst index cd636041ffe0..175c3de901c7 100644 --- a/doc/source/train/user-guides.rst +++ b/doc/source/train/user-guides.rst @@ -1,5 +1,5 @@ -User Guides -=========== +Ray Train User Guides +===================== .. panels:: :container: container pb-4 full-width