Lightning-AI
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/source-app/api_reference/components.rst‎
Lines changed: 1 addition & 0 deletions b/‎docs/source-app/api_reference/components.rst‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎examples/app_multi_node/app.py‎
Lines changed: 11 additions & 0 deletions b/‎examples/app_multi_node/app.py‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎examples/app_multi_node/.gitignore‎ renamed to ‎examples/app_multi_node/bare/.gitignore‎ b/‎examples/app_multi_node/.gitignore‎ renamed to ‎examples/app_multi_node/bare/.gitignore‎
diff --git a/‎examples/app_multi_node/multi_node.py‎ renamed to ‎examples/app_multi_node/bare/multi_node.py‎ b/‎examples/app_multi_node/multi_node.py‎ renamed to ‎examples/app_multi_node/bare/multi_node.py‎
diff --git a/‎examples/app_multi_node/requirements.txt‎ renamed to ‎examples/app_multi_node/bare/requirements.txt‎ b/‎examples/app_multi_node/requirements.txt‎ renamed to ‎examples/app_multi_node/bare/requirements.txt‎
diff --git a/‎examples/app_multi_node/train.py‎
Lines changed: 7 additions & 0 deletions b/‎examples/app_multi_node/train.py‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎src/lightning_app/CHANGELOG.md‎
Lines changed: 2 additions & 0 deletions b/‎src/lightning_app/CHANGELOG.md‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/lightning_app/components/python/tracer.py‎
Lines changed: 46 additions & 2 deletions b/‎src/lightning_app/components/python/tracer.py‎
Lines changed: 46 additions & 2 deletions
diff --git a/‎src/lightning_app/components/training.py‎
Lines changed: 192 additions & 0 deletions b/‎src/lightning_app/components/training.py‎
Lines changed: 192 additions & 0 deletions
@@ -163,3 +163,4 @@ src/lightning_app/ui/*
 *examples/template_react_ui*
 hars*
 artifacts/*
+*docs/examples*
@@ -20,5 +20,6 @@ ___________________
 
     ~python.popen.PopenPythonScript
     ~python.tracer.TracerPythonScript
+    ~training.LightningTrainingComponent
     ~serve.gradio.ServeGradio
     ~serve.serve.ModelInferenceAPI
@@ -0,0 +1,11 @@
+from lightning import LightningApp
+from lightning.app.components.training import LightningTrainingComponent
+from lightning.app.utilities.packaging.cloud_compute import CloudCompute
+
+# Train `train.py` on two machines; the training component creates one work per node.
+app = LightningApp(
+    LightningTrainingComponent(
+        script_path="train.py",
+        cloud_compute=CloudCompute("gpu-fast-multi"),
+        num_nodes=2,
+    )
+)
@@ -0,0 +1,7 @@
+from lightning.pytorch import Trainer
+from lightning.pytorch.demos.boring_classes import BoringModel
+
+if __name__ == "__main__":
+    model = BoringModel()
+    # The number of nodes, the devices and the accelerator are not set here: the script runner in
+    # `lightning_app/components/training.py` exports `PL_TRAINER_NUM_NODES`, `PL_TRAINER_DEVICES`
+    # and `PL_TRAINER_ACCELERATOR` before executing this script.
+    # NOTE: keep `trainer` as a module-level name. After the run, that runner scans the script's
+    # globals for a `Trainer` (or `LightningCLI`) instance and raises if none is found.
+    trainer = Trainer(max_epochs=1)
+    trainer.fit(model)
@@ -10,6 +10,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Add support for `Lightning App Commands` through the `configure_commands` hook on the Lightning Flow and the `ClientCommand`  ([#13602](https://github.com/Lightning-AI/lightning/pull/13602))
 
+- Add `LightningTrainingComponent` to orchestrate multi-node training in the cloud ([#13830](https://github.com/Lightning-AI/lightning/pull/13830))
+
 ### Changed
 
 - Update the Lightning App docs ([#13537](https://github.com/Lightning-AI/lightning/pull/13537))
 
@@ -2,16 +2,24 @@
 import os
 import signal
 import sys
-from typing import Any, Dict, List, Optional, Union
+from copy import deepcopy
+from typing import Any, Dict, List, Optional, TypedDict, Union
 
 from lightning_app import LightningWork
+from lightning_app.storage.drive import Drive
 from lightning_app.storage.payload import Payload
 from lightning_app.utilities.app_helpers import _collect_child_process_pids
+from lightning_app.utilities.packaging.tarfile import clean_tarfile, extract_tarfile
 from lightning_app.utilities.tracer import Tracer
 
 logger = logging.getLogger(__name__)
 
 
+# Describes a packaged source archive: `drive` is the Drive that stores it and `name` is the
+# archive's file name inside that drive (both keys are read in `TracerPythonScript.__init__`).
+Code = TypedDict("Code", {"drive": Drive, "name": str})
+
+
 class TracerPythonScript(LightningWork):
     def on_before_run(self):
         """Called before the python script is executed."""
@@ -31,6 +39,7 @@ def __init__(
         script_args: Optional[Union[list, str]] = None,
         outputs: Optional[List[str]] = None,
         env: Optional[Dict] = None,
+        code: Optional[Code] = None,
         **kwargs,
     ):
         """The TracerPythonScript class enables to easily run a python script.
@@ -97,17 +106,46 @@ def __init__(
         if isinstance(script_args, str):
             script_args = script_args.split(" ")
         self.script_args = script_args if script_args else []
+        self.original_args = deepcopy(self.script_args)
         self.env = env
         self.outputs = outputs or []
         for name in self.outputs:
             setattr(self, name, None)
+        self.params = None
+        self.drive = code.get("drive") if code else None
+        self.code_name = code.get("name") if code else None
+        self.restart_count = 0
+
+    def run(self, params: Optional[Dict[str, Any]] = None, restart_count: Optional[int] = None, **kwargs):
+        """
+        Arguments:
+            params: A dictionary of arguments to be added to script_args.
+            restart_count: Passes an incrementing counter to enable the re-execution of LightningWorks.
+        """
+        if restart_count:
+            self.restart_count = restart_count
+
+        if params:
+            self.params = params
+            self.script_args = self.original_args + [self._to_script_args(k, v) for k, v in params.items()]
+
+        if self.drive:
+            assert self.code_name
+            if os.path.exists(self.code_name):
+                clean_tarfile(self.code_name, "r:gz")
+
+            if self.code_name in self.drive.list():
+                self.drive.get(self.code_name)
+                extract_tarfile(self.code_name, ".", "r:gz")
 
-    def run(self, **kwargs):
         if not os.path.exists(self.script_path):
             raise FileNotFoundError(f"The provided `script_path` {self.script_path}` wasn't found.")
+
         kwargs = {k: v.value if isinstance(v, Payload) else v for k, v in kwargs.items()}
+
         init_globals = globals()
         init_globals.update(kwargs)
+
         self.on_before_run()
         env_copy = os.environ.copy()
         if self.env:
@@ -125,5 +163,11 @@ def on_exit(self):
         for child_pid in _collect_child_process_pids(os.getpid()):
             os.kill(child_pid, signal.SIGTERM)
 
+    @staticmethod
+    def _to_script_args(k: str, v: str) -> str:
+        """Render one ``params`` entry as a ``--key=value`` command-line argument."""
+        # A key that already carries the leading dashes is not prefixed a second time.
+        prefix = "" if k.startswith("--") else "--"
+        return f"{prefix}{k}={v}"
+
 
 __all__ = ["TracerPythonScript"]
@@ -0,0 +1,192 @@
+import logging
+import os
+from typing import Any, Dict, List, Optional, Tuple, Type, Union
+
+from lightning import CloudCompute
+from lightning_app import LightningFlow, structures
+from lightning_app.components.python import TracerPythonScript
+from lightning_app.storage.path import Path
+
+_logger = logging.getLogger(__name__)
+
+
+class PyTorchLightningScriptRunner(TracerPythonScript):
+    """Execute a PyTorch Lightning script as one node of a multi-node training job.
+
+    Before the script starts, the distributed-training environment variables (``MASTER_ADDR``,
+    ``MASTER_PORT``, ``NODE_RANK`` and the ``PL_TRAINER_*`` settings) are exported. Once the script
+    ends, the checkpoint information of its ``Trainer`` is copied onto this work.
+    """
+
+    def __init__(
+        self,
+        script_path: str,
+        script_args: Optional[Union[list, str]] = None,
+        node_rank: int = 1,
+        num_nodes: int = 1,
+        sanity_serving: bool = False,
+        cloud_compute: Optional[CloudCompute] = None,
+        parallel: bool = True,
+        raise_exception: bool = True,
+        env: Optional[Dict[str, Any]] = None,
+        **kwargs,
+    ):
+        """
+        Arguments:
+            script_path: Path to the training script to execute.
+            script_args: The arguments to be passed to the script.
+            node_rank: Rank of this node, exported as ``NODE_RANK``. Only rank 0 receives the serving
+                sanity-check callback. ``LightningTrainingComponent`` always passes this explicitly.
+            num_nodes: Total number of nodes, exported as ``PL_TRAINER_NUM_NODES``.
+            sanity_serving: Whether to add a ``ServableModuleValidator`` callback to the Trainer on rank 0.
+            cloud_compute: Forwarded to ``TracerPythonScript``.
+            parallel: Forwarded to ``TracerPythonScript``.
+            raise_exception: Forwarded to ``TracerPythonScript``.
+            env: Environment variables merged into ``os.environ`` right before the script is executed.
+            **kwargs: Extra keyword arguments forwarded to ``TracerPythonScript``.
+        """
+        super().__init__(
+            script_path,
+            script_args,
+            raise_exception=raise_exception,
+            parallel=parallel,
+            cloud_compute=cloud_compute,
+            **kwargs,
+        )
+        self.node_rank = node_rank
+        self.num_nodes = num_nodes
+        self.best_model_path = None
+        self.best_model_score = None
+        self.monitor = None
+        self.sanity_serving = sanity_serving
+        self.has_finished = False
+        # `env` is not forwarded to the parent constructor, so it is stored here instead.
+        self.env = env
+
+    def configure_tracer(self):
+        """Extend the parent tracer so that ``Trainer.__init__`` goes through the callback middleware."""
+        from pytorch_lightning import Trainer
+
+        tracer = super().configure_tracer()
+        tracer.add_traced(Trainer, "__init__", pre_fn=self._trainer_init_pre_middleware)
+        return tracer
+
+    def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None, **kwargs) -> None:
+        """Start the node or, once every node address is known, run the training script.
+
+        Arguments:
+            internal_urls: One ``(internal_ip, port)`` pair per node. The first entry is used as the
+                master address. When empty or ``None``, nothing is executed.
+            **kwargs: Forwarded to ``TracerPythonScript.run``.
+        """
+        if not internal_urls:
+            # Note: This is called only once.
+            _logger.info(f"The node {self.node_rank} started !")
+            return None
+
+        if self.env:
+            os.environ.update(self.env)
+
+        distributed_env_vars = {
+            "MASTER_ADDR": internal_urls[0][0],
+            "MASTER_PORT": str(internal_urls[0][1]),
+            "NODE_RANK": str(self.node_rank),
+            "PL_TRAINER_NUM_NODES": str(self.num_nodes),
+            "PL_TRAINER_DEVICES": "auto",
+            "PL_TRAINER_ACCELERATOR": "auto",
+        }
+
+        # Exported after `self.env`, so these values win over user-provided ones with the same name.
+        os.environ.update(distributed_env_vars)
+        return super().run(**kwargs)
+
+    def on_after_run(self, script_globals):
+        """Copy the checkpoint information of the script's Trainer onto this work.
+
+        Arguments:
+            script_globals: The globals of the executed script. The first ``LightningCLI`` or
+                ``Trainer`` instance found among its values is used.
+
+        Raises:
+            RuntimeError: If the script globals contain neither a ``LightningCLI`` nor a ``Trainer``.
+        """
+        from pytorch_lightning import Trainer
+        from pytorch_lightning.cli import LightningCLI
+
+        for v in script_globals.values():
+            if isinstance(v, LightningCLI):
+                trainer = v.trainer
+                break
+            elif isinstance(v, Trainer):
+                trainer = v
+                break
+        else:
+            raise RuntimeError("No trainer instance found.")
+
+        # `checkpoint_callback` is None when the Trainer has no ModelCheckpoint callback
+        # (e.g. `enable_checkpointing=False`); there is nothing to record in that case.
+        checkpoint_callback = trainer.checkpoint_callback
+        if checkpoint_callback is not None:
+            self.monitor = checkpoint_callback.monitor
+
+            # Compare against None explicitly: a score of exactly 0.0 is falsy but still valid.
+            best_model_score = checkpoint_callback.best_model_score
+            if best_model_score is not None:
+                self.best_model_path = Path(checkpoint_callback.best_model_path)
+                self.best_model_score = float(best_model_score)
+            else:
+                # Without a score, prefer the last checkpoint; if that path is empty, fall back to
+                # `best_model_path` instead of storing a path built from an empty string.
+                fallback_path = checkpoint_callback.last_model_path or checkpoint_callback.best_model_path
+                if fallback_path:
+                    self.best_model_path = Path(fallback_path)
+
+        # Always flag completion, otherwise the parent flow would never stop the works.
+        self.has_finished = True
+
+    def _trainer_init_pre_middleware(self, trainer, *args, **kwargs):
+        """Add the serving sanity-check callback to the Trainer arguments on rank 0.
+
+        Returns:
+            A tuple ``({}, args, kwargs)`` holding the possibly updated Trainer arguments.
+        """
+        if self.node_rank != 0:
+            return {}, args, kwargs
+
+        # `callbacks` can be missing, None, a single callback or a sequence of callbacks.
+        callbacks = kwargs.get("callbacks")
+        if callbacks is None:
+            callbacks = []
+        elif isinstance(callbacks, tuple):
+            callbacks = list(callbacks)
+        elif not isinstance(callbacks, list):
+            callbacks = [callbacks]
+
+        if self.sanity_serving:
+            # Imported lazily so that the serve module is only required when the check is enabled.
+            from pytorch_lightning.serve import ServableModuleValidator
+
+            callbacks = callbacks + [ServableModuleValidator()]
+        kwargs["callbacks"] = callbacks
+        return {}, args, kwargs
+
+    @property
+    def is_running_in_cloud(self) -> bool:
+        """Whether the ``LIGHTNING_APP_STATE_URL`` environment variable is set."""
+        return "LIGHTNING_APP_STATE_URL" in os.environ
+
+
+class LightningTrainingComponent(LightningFlow):
+    def __init__(
+        self,
+        script_path: str,
+        script_args: Optional[Union[list, str]] = None,
+        num_nodes: int = 1,
+        cloud_compute: CloudCompute = CloudCompute("default"),
+        sanity_serving: bool = False,
+        script_runner: Type[TracerPythonScript] = PyTorchLightningScriptRunner,
+        **script_runner_kwargs,
+    ):
+        """This component enables performing distributed multi-node multi-device training.
+
+        Example::
+
+            from lightning import LightningApp
+            from lightning.app.components.training import LightningTrainingComponent
+            from lightning.app.utilities.packaging.cloud_compute import CloudCompute
+
+            app = LightningApp(
+                LightningTrainingComponent(
+                    "train.py",
+                    num_nodes=2,
+                    cloud_compute=CloudCompute("gpu"),
+                ),
+            )
+
+        Arguments:
+            script_path: Path to the script to be executed.
+            script_args: The arguments to be passed to the script.
+            num_nodes: Number of nodes. One work is created per node.
+            cloud_compute: The cloud compute object used in the cloud.
+            sanity_serving: Whether to validate that the model correctly implements
+                the ServableModule API
+            script_runner: The work class instantiated once per node to execute the script.
+            script_runner_kwargs: Extra keyword arguments passed to every ``script_runner`` instance.
+        """
+        super().__init__()
+        self.ws = structures.List()
+        self.has_initialized = False
+        self.script_path = script_path
+        self.script_args = script_args
+        self.num_nodes = num_nodes
+        self._cloud_compute = cloud_compute  # TODO: Add support for cloudCompute
+        self.sanity_serving = sanity_serving
+        self._script_runner = script_runner
+        self._script_runner_kwargs = script_runner_kwargs
+
+    def run(self, **run_kwargs):
+        """Create one work per node, start them, then launch training once every node has an address.
+
+        Arguments:
+            **run_kwargs: Forwarded to each work's ``run`` once all the internal IPs are known.
+        """
+        if not self.has_initialized:
+            # The works are created lazily on the first call, one per node rank.
+            for node_rank in range(self.num_nodes):
+                self.ws.append(
+                    self._script_runner(
+                        script_path=self.script_path,
+                        script_args=self.script_args,
+                        cloud_compute=self._cloud_compute,
+                        node_rank=node_rank,
+                        sanity_serving=self.sanity_serving,
+                        num_nodes=self.num_nodes,
+                        **self._script_runner_kwargs,
+                    )
+                )
+
+            self.has_initialized = True
+
+        for work in self.ws:
+            if all(w.internal_ip for w in self.ws):
+                # Works were appended in rank order, so the first URL belongs to node rank 0.
+                internal_urls = [(w.internal_ip, w.port) for w in self.ws]
+                work.run(internal_urls=internal_urls, **run_kwargs)
+                if all(w.has_finished for w in self.ws):
+                    for w in self.ws:
+                        w.stop()
+            else:
+                # Called without arguments until every work reports an internal IP.
+                work.run()
+
+    @property
+    def best_model_score(self) -> Optional[float]:
+        """The best model score recorded by the first work, or ``None`` if no work exists yet."""
+        # The works are only created on the first `run` call.
+        if len(self.ws) == 0:
+            return None
+        return self.ws[0].best_model_score
+
+    @property
+    def best_model_paths(self) -> List[Optional[Path]]:
+        """The best model path recorded by each work, in the order the works were created."""
+        return [work.best_model_path for work in self.ws]