diff --git a/pyproject.toml b/pyproject.toml index c0bac18466c85..5508e98981877 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -141,6 +141,7 @@ exclude = [ "src/lightning/app/cli/component-template", "src/lightning/app/cli/pl-app-template", "src/lightning/app/cli/react-ui-template", + "src/lightning/app/launcher", ] install_types = "True" non_interactive = "True" diff --git a/src/lightning/app/cli/lightning_cli.py b/src/lightning/app/cli/lightning_cli.py index 43c9e82ff477f..9bf8877fac975 100644 --- a/src/lightning/app/cli/lightning_cli.py +++ b/src/lightning/app/cli/lightning_cli.py @@ -39,6 +39,7 @@ ) from lightning.app.cli.connect.data import connect_data from lightning.app.cli.lightning_cli_delete import delete +from lightning.app.cli.lightning_cli_launch import launch from lightning.app.cli.lightning_cli_list import get_list from lightning.app.core.constants import ENABLE_APP_COMMENT_COMMAND_EXECUTION, get_lightning_cloud_url from lightning.app.runners.cloud import CloudRuntime @@ -324,6 +325,7 @@ def open(path: str, name: str) -> None: _main.add_command(get_list) _main.add_command(delete) +_main.add_command(launch) _main.add_command(cmd_install.install) diff --git a/src/lightning/app/cli/lightning_cli_launch.py b/src/lightning/app/cli/lightning_cli_launch.py new file mode 100644 index 0000000000000..8cf56453d86f9 --- /dev/null +++ b/src/lightning/app/cli/lightning_cli_launch.py @@ -0,0 +1,127 @@ +# Copyright The Lightning AI team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from typing import Tuple + +import click + +from lightning.app.core.constants import APP_SERVER_HOST, APP_SERVER_PORT +from lightning.app.launcher.launcher import ( + run_lightning_flow, + run_lightning_work, + serve_frontend, + start_application_server, + start_flow_and_servers, +) + +logger = logging.getLogger(__name__) + + +@click.group(name="launch", hidden=True) +def launch() -> None: + """Launch your application.""" + + +@launch.command("server", hidden=True) +@click.argument("file", type=click.Path(exists=True)) +@click.option("--queue-id", help="ID for identifying queue", default="", type=str) +@click.option("--host", help="Application running host", default=APP_SERVER_HOST, type=str) +@click.option("--port", help="Application running port", default=APP_SERVER_PORT, type=int) +def run_server(file: str, queue_id: str, host: str, port: int) -> None: + """It takes the application file as input, build the application object and then use that to run the + application server. + + This is used by the cloud runners to start the status server for the application + """ + logger.debug(f"Run Server: {file} {queue_id} {host} {port}") + start_application_server(file, host, port, queue_id=queue_id) + + +@launch.command("flow", hidden=True) +@click.argument("file", type=click.Path(exists=True)) +@click.option("--queue-id", help="ID for identifying queue", default="", type=str) +@click.option("--base-url", help="Base url at which the app server is hosted", default="") +def run_flow(file: str, queue_id: str, base_url: str) -> None: + """It takes the application file as input, build the application object, proxy all the work components and then + run the application flow defined in the root component. + + It does exactly what a singleprocess dispatcher would do but with proxied work components. + """ + logger.debug(f"Run Flow: {file} {queue_id} {base_url}") + run_lightning_flow(file, queue_id=queue_id, base_url=base_url) + + +@launch.command("work", hidden=True) +@click.argument("file", type=click.Path(exists=True)) +@click.option("--work-name", type=str) +@click.option("--queue-id", help="ID for identifying queue", default="", type=str) +def run_work(file: str, work_name: str, queue_id: str) -> None: + """Unlike other entrypoints, this command will take the file path or module details for a work component and + run that by fetching the states from the queues.""" + logger.debug(f"Run Work: {file} {work_name} {queue_id}") + run_lightning_work( + file=file, + work_name=work_name, + queue_id=queue_id, + ) + + +@launch.command("frontend", hidden=True) +@click.argument("file", type=click.Path(exists=True)) +@click.option("--flow-name") +@click.option("--host") +@click.option("--port", type=int) +def run_frontend(file: str, flow_name: str, host: str, port: int) -> None: + """Serve the frontend specified by the given flow.""" + logger.debug(f"Run Frontend: {file} {flow_name} {host}") + serve_frontend(file=file, flow_name=flow_name, host=host, port=port) + + +@launch.command("flow-and-servers", hidden=True) +@click.argument("file", type=click.Path(exists=True)) +@click.option("--queue-id", help="ID for identifying queue", default="", type=str) +@click.option("--base-url", help="Base url at which the app server is hosted", default="") +@click.option("--host", help="Application running host", default=APP_SERVER_HOST, type=str) +@click.option("--port", help="Application running port", default=APP_SERVER_PORT, type=int) +@click.option( + "--flow-port", + help="Pair of flow name and frontend port", + type=(str, int), + multiple=True, +) +def run_flow_and_servers( + file: str, + base_url: str, + queue_id: str, + host: str, + port: int, + flow_port: Tuple[Tuple[str, int]], +) -> None: + """It takes the application file as input, build the application object and then use that to run the + application flow defined in the root component, the application server and all the flow frontends. + + This is used by the cloud runners to start the flow, the status server and all frontends for the application + """ + logger.debug(f"Run Flow: {file} {queue_id} {base_url}") + logger.debug(f"Run Server: {file} {queue_id} {host} {port}.") + logger.debug(f"Run Frontend's: {flow_port}") + start_flow_and_servers( + entrypoint_file=file, + base_url=base_url, + queue_id=queue_id, + host=host, + port=port, + flow_names_and_ports=flow_port, + ) diff --git a/src/lightning/app/launcher/__init__.py b/src/lightning/app/launcher/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/src/lightning/app/launcher/launcher.py b/src/lightning/app/launcher/launcher.py new file mode 100644 index 0000000000000..8f00731161dfc --- /dev/null +++ b/src/lightning/app/launcher/launcher.py @@ -0,0 +1,439 @@ +import inspect +import logging +import os +import signal +import sys +import time +import traceback +from functools import partial +from multiprocessing import Process +from typing import Callable, Dict, List, Optional, Tuple, TypedDict + +ENABLE_MULTIPLE_WORKS_IN_DEFAULT_CONTAINER = bool(int(os.getenv("ENABLE_MULTIPLE_WORKS_IN_DEFAULT_CONTAINER", "0"))) + +if True: # Avoid Module level import not at top of file + from lightning.app import LightningFlow + from lightning.app.core import constants + from lightning.app.core.api import start_server + from lightning.app.core.queues import MultiProcessQueue, QueuingSystem + from lightning.app.storage.orchestrator import StorageOrchestrator + from lightning.app.utilities.app_commands import run_app_commands + from lightning.app.utilities.cloud import _sigterm_flow_handler + from lightning.app.utilities.component import _set_flow_context, _set_frontend_context + from lightning.app.utilities.enum import AppStage + from lightning.app.utilities.exceptions import ExitAppException + from lightning.app.utilities.load_app import extract_metadata_from_app, load_app_from_file + from lightning.app.utilities.proxies import WorkRunner + from lightning.app.utilities.redis import check_if_redis_running + +if ENABLE_MULTIPLE_WORKS_IN_DEFAULT_CONTAINER: + from lightning.app.launcher.lightning_hybrid_backend import CloudHybridBackend as CloudBackend +else: + from lightning.app.launcher.lightning_backend import CloudBackend + +if True: # Avoid Module level import not at top of file + from lightning.app.utilities.app_helpers import convert_print_to_logger_info + from lightning.app.utilities.packaging.lightning_utils import enable_debugging + +if hasattr(constants, "get_cloud_queue_type"): + CLOUD_QUEUE_TYPE = constants.get_cloud_queue_type() or "redis" +else: + CLOUD_QUEUE_TYPE = "redis" + +logger = logging.getLogger(__name__) + + +class FlowRestAPIQueues(TypedDict): + api_publish_state_queue: MultiProcessQueue + api_response_queue: MultiProcessQueue + + +@convert_print_to_logger_info +@enable_debugging +def start_application_server( + entrypoint_file: str, host: str, port: int, queue_id: str, queues: Optional[FlowRestAPIQueues] = None +): + logger.debug(f"Run Lightning Work {entrypoint_file} {host} {port} {queue_id}") + queue_system = QueuingSystem(CLOUD_QUEUE_TYPE) + + wait_for_queues(queue_system) + + kwargs = { + "api_delta_queue": queue_system.get_api_delta_queue(queue_id=queue_id), + } + + # Note: Override the queues if provided + if isinstance(queues, Dict): + kwargs.update(queues) + else: + kwargs.update( + { + "api_publish_state_queue": queue_system.get_api_state_publish_queue(queue_id=queue_id), + "api_response_queue": queue_system.get_api_response_queue(queue_id=queue_id), + } + ) + + app = load_app_from_file(entrypoint_file) + + from lightning.app.api.http_methods import _add_tags_to_api, _validate_api + from lightning.app.utilities.app_helpers import is_overridden + from lightning.app.utilities.commands.base import _commands_to_api, _prepare_commands + + apis = [] + if is_overridden("configure_api", app.root): + apis = app.root.configure_api() + _validate_api(apis) + _add_tags_to_api(apis, ["app_api"]) + + if is_overridden("configure_commands", app.root): + commands = _prepare_commands(app) + apis += _commands_to_api(commands) + + start_server( + host=host, + port=port, + apis=apis, + **kwargs, + spec=extract_metadata_from_app(app), + ) + + +@convert_print_to_logger_info +@enable_debugging +def run_lightning_work( + file: str, + work_name: str, + queue_id: str, +): + """This staticmethod runs the specified work in the current process. + + It is organized under cloud runtime to indicate that it will be used by the cloud runner but otherwise, no cloud + specific logic is being implemented here + """ + logger.debug(f"Run Lightning Work {file} {work_name} {queue_id}") + + queues = QueuingSystem(CLOUD_QUEUE_TYPE) + wait_for_queues(queues) + + caller_queue = queues.get_caller_queue(work_name=work_name, queue_id=queue_id) + readiness_queue = queues.get_readiness_queue(queue_id=queue_id) + delta_queue = queues.get_delta_queue(queue_id=queue_id) + error_queue = queues.get_error_queue(queue_id=queue_id) + + request_queues = queues.get_orchestrator_request_queue(work_name=work_name, queue_id=queue_id) + response_queues = queues.get_orchestrator_response_queue(work_name=work_name, queue_id=queue_id) + copy_request_queues = queues.get_orchestrator_copy_request_queue(work_name=work_name, queue_id=queue_id) + copy_response_queues = queues.get_orchestrator_copy_response_queue(work_name=work_name, queue_id=queue_id) + + run_app_commands(file) + + load_app_from_file(file) + + queue = queues.get_work_queue(work_name=work_name, queue_id=queue_id) + work = queue.get() + + extras = {} + + if hasattr(work, "_run_executor_cls"): + extras["run_executor_cls"] = work._run_executor_cls + + WorkRunner( + work=work, + work_name=work_name, + caller_queue=caller_queue, + delta_queue=delta_queue, + readiness_queue=readiness_queue, + error_queue=error_queue, + request_queue=request_queues, + response_queue=response_queues, + copy_request_queue=copy_request_queues, + copy_response_queue=copy_response_queues, + **extras, + )() + + +@convert_print_to_logger_info +@enable_debugging +def run_lightning_flow(entrypoint_file: str, queue_id: str, base_url: str, queues: Optional[FlowRestAPIQueues] = None): + _set_flow_context() + + logger.debug(f"Run Lightning Flow {entrypoint_file} {queue_id} {base_url}") + + app = load_app_from_file(entrypoint_file) + app.backend = CloudBackend(entrypoint_file, queue_id=queue_id) + + queue_system = app.backend.queues + app.backend.update_lightning_app_frontend(app) + wait_for_queues(queue_system) + + app.backend.resolve_url(app, base_url) + if app.root_path != "": + app._update_index_file() + app.backend._prepare_queues(app) + + # Note: Override the queues if provided + if queues: + app.api_publish_state_queue = queues["api_publish_state_queue"] + app.api_response_queue = queues["api_response_queue"] + + LightningFlow._attach_backend(app.root, app.backend) + + app.should_publish_changes_to_api = True + + storage_orchestrator = StorageOrchestrator( + app, + app.request_queues, + app.response_queues, + app.copy_request_queues, + app.copy_response_queues, + ) + storage_orchestrator.setDaemon(True) + storage_orchestrator.start() + + # refresh the layout with the populated urls. + app._update_layout() + + # register a signal handler to clean all works. + if sys.platform != "win32": + signal.signal(signal.SIGTERM, partial(_sigterm_flow_handler, app=app)) + + if "apis" in inspect.signature(start_server).parameters: + from lightning.app.utilities.commands.base import _prepare_commands + + _prepare_commands(app) + + # Once the bootstrapping is done, running the rank 0 + # app with all the components inactive + try: + app._run() + except ExitAppException: + pass + except Exception: + app.stage = AppStage.FAILED + print(traceback.format_exc()) + + storage_orchestrator.join(0) + app.backend.stop_all_works(app.works) + + exit_code = 1 if app.stage == AppStage.FAILED else 0 + print(f"Finishing the App with exit_code: {str(exit_code)}...") + + if not exit_code: + app.backend.stop_app(app) + + sys.exit(exit_code) + + +@convert_print_to_logger_info +@enable_debugging +def serve_frontend(file: str, flow_name: str, host: str, port: int): + """This staticmethod runs the specified frontend for a given flow in a new process. + + It is organized under cloud runtime to indicate that it will be used by the cloud runner but otherwise, no cloud + specific logic is being implemented here. + """ + _set_frontend_context() + logger.debug(f"Run Serve Frontend {file} {flow_name} {host} {port}") + app = load_app_from_file(file) + if flow_name not in app.frontends: + raise ValueError(f"Could not find frontend for flow with name {flow_name}.") + frontend = app.frontends[flow_name] + assert frontend.flow.name == flow_name + + frontend.start_server(host, port) + + +def start_server_in_process(target: Callable, args: Tuple = (), kwargs: Dict = {}) -> Process: + p = Process(target=target, args=args, kwargs=kwargs) + p.start() + return p + + +def format_row(elements, col_widths, padding=1): + elements = [el.ljust(w - padding * 2) for el, w in zip(elements, col_widths)] + pad = " " * padding + elements = [f"{pad}{el}{pad}" for el in elements] + return f'|{"|".join(elements)}|' + + +def tabulate(data, headers): + data = [[str(el) for el in row] for row in data] + col_widths = [len(el) for el in headers] + for row in data: + col_widths = [max(len(el), curr) for el, curr in zip(row, col_widths)] + col_widths = [w + 2 for w in col_widths] + seps = ["-" * w for w in col_widths] + lines = [format_row(headers, col_widths), format_row(seps, col_widths, padding=0)] + [ + format_row(row, col_widths) for row in data + ] + return "\n".join(lines) + + +def manage_server_processes(processes: List[Tuple[str, Process]]) -> None: + if not processes: + return + + sigterm_called = [False] + + def _sigterm_handler(*_): + sigterm_called[0] = True + + if sys.platform != "win32": + signal.signal(signal.SIGTERM, _sigterm_handler) + + # Since frontends run user code, any of them could fail. In that case, + # we want to fail all of them, as well as the application server, and + # exit the command with an error status code. + + exitcode = 0 + + while True: + # We loop until + # 1. Get a sigterm + # 2. All the children died but all with exit code 0 + # 3. At-least one of the child died with non-zero exit code + + # sleeping quickly at the starting of every loop + # moving this to the end of the loop might result in some flaky tests + time.sleep(1) + + if sigterm_called[0]: + print("Got SIGTERM. Exiting execution!!!") + break + if all(not p.is_alive() and p.exitcode == 0 for _, p in processes): + print("All the components are inactive with exitcode 0. Exiting execution!!!") + break + if any((not p.is_alive() and p.exitcode != 0) for _, p in processes): + print("Found dead components with non-zero exit codes, exiting execution!!! Components: ") + print( + tabulate( + [(name, p.exitcode) for name, p in processes if not p.is_alive() and p.exitcode != 0], + headers=["Name", "Exit Code"], + ) + ) + exitcode = 1 + break + + # sleeping for the last set of logs to reach stdout + time.sleep(2) + + # Cleanup + for _, p in processes: + if p.is_alive(): + os.kill(p.pid, signal.SIGTERM) + + # Give processes time to terminate + for _, p in processes: + p.join(5) + + # clean the remaining ones. + if any(p.is_alive() for _, p in processes): + for _, p in processes: + if p.is_alive(): + os.kill(p.pid, signal.SIGKILL) + + # this sleep is just a precaution - signals might take a while to get raised. + time.sleep(1) + sys.exit(1) + + sys.exit(exitcode) + + +def _get_frontends_from_app(entrypoint_file): + """This function is used to get the frontends from the app. It will be used to start the frontends in a + separate process if the backend cannot provide flow_names_and_ports. This is useful if the app cannot be loaded + locally to set the frontend before dispatching to the cloud. The backend exposes by default 10 ports from 8081 + if the app.spec.frontends is not set. + + NOTE: frontend_name are sorted to ensure that they get consistent ports. + + :param entrypoint_file: The entrypoint file for the app + :return: A list of tuples of the form (frontend_name, port_number) + """ + app = load_app_from_file(entrypoint_file) + + frontends = [] + # This value of the port should be synced with the port value in the backend. + # If you change this value, you should also change the value in the backend. + flow_frontends_starting_port = 8081 + for frontend in sorted(app.frontends.keys()): + frontends.append((frontend, flow_frontends_starting_port)) + flow_frontends_starting_port += 1 + + return frontends + + +@convert_print_to_logger_info +@enable_debugging +def start_flow_and_servers( + entrypoint_file: str, + base_url: str, + queue_id: str, + host: str, + port: int, + flow_names_and_ports: Tuple[Tuple[str, int]], +): + processes: List[Tuple[str, Process]] = [] + + # Queues between Flow and its Rest API are using multiprocessing to: + # - reduce redis load + # - increase UI responsiveness and RPS + queue_system = QueuingSystem.MULTIPROCESS + queues = { + "api_publish_state_queue": queue_system.get_api_state_publish_queue(queue_id=queue_id), + "api_response_queue": queue_system.get_api_response_queue(queue_id=queue_id), + } + + # In order to avoid running this function 3 seperate times while executing the + # `run_lightning_flow`, `start_application_server`, & `serve_frontend` functions + # in a subprocess we extract this to the top level. If we intend to make changes + # to be able to start these components in seperate containers, the implementation + # will have to move a call to this function within the initialization process. + run_app_commands(entrypoint_file) + + flow_process = start_server_in_process( + run_lightning_flow, + args=( + entrypoint_file, + queue_id, + base_url, + ), + kwargs={"queues": queues}, + ) + processes.append(("Flow", flow_process)) + + server_process = start_server_in_process( + target=start_application_server, + args=( + entrypoint_file, + host, + port, + queue_id, + ), + kwargs={"queues": queues}, + ) + processes.append(("Server", server_process)) + + if not flow_names_and_ports: + flow_names_and_ports = _get_frontends_from_app(entrypoint_file) + + for name, fe_port in flow_names_and_ports: + frontend_process = start_server_in_process(target=serve_frontend, args=(entrypoint_file, name, host, fe_port)) + processes.append((name, frontend_process)) + + manage_server_processes(processes) + + +def wait_for_queues(queue_system: QueuingSystem) -> None: + queue_check_start_time = int(time.time()) + + if hasattr(queue_system, "get_queue"): + while not queue_system.get_queue("healthz").is_running: + if (int(time.time()) - queue_check_start_time) % 10 == 0: + logger.warning("Waiting for http queues to start...") + time.sleep(1) + else: + while not check_if_redis_running(): + if (int(time.time()) - queue_check_start_time) % 10 == 0: + logger.warning("Waiting for redis queues to start...") + time.sleep(1) diff --git a/src/lightning/app/launcher/lightning_backend.py b/src/lightning/app/launcher/lightning_backend.py new file mode 100644 index 0000000000000..1e3c096e45cf1 --- /dev/null +++ b/src/lightning/app/launcher/lightning_backend.py @@ -0,0 +1,523 @@ +import inspect +import json +import logging +import os +import random +import string +import urllib +from time import monotonic, sleep, time +from typing import List, Optional + +from lightning_cloud.openapi import ( + AppinstancesIdBody, + Externalv1LightningappInstance, + Externalv1Lightningwork, + V1BuildSpec, + V1Drive, + V1DriveSpec, + V1DriveStatus, + V1DriveType, + V1Flowserver, + V1LightningappInstanceState, + V1LightningappRestartPolicy, + V1LightningworkClusterDriver, + V1LightningworkDrives, + V1LightningworkSpec, + V1LightningworkState, + V1ListLightningworkResponse, + V1Metadata, + V1NetworkConfig, + V1PackageManager, + V1PythonDependencyInfo, + V1SourceType, + V1UserRequestedComputeConfig, +) +from lightning_cloud.openapi.rest import ApiException + +from lightning.app import LightningApp, LightningWork +from lightning.app.core.queues import QueuingSystem +from lightning.app.runners.backends.backend import Backend +from lightning.app.storage import Drive, Mount +from lightning.app.utilities.enum import make_status, WorkStageStatus, WorkStopReasons +from lightning.app.utilities.exceptions import LightningPlatformException +from lightning.app.utilities.network import _check_service_url_is_ready, LightningClient + +logger = logging.getLogger(__name__) + +from lightning_cloud.openapi import SpecLightningappInstanceIdWorksBody, WorksIdBody # noqa: E402 + +LIGHTNING_STOP_TIMEOUT = int(os.getenv("LIGHTNING_STOP_TIMEOUT", 2 * 60)) + + +def cloud_work_stage_to_work_status_stage(stage: V1LightningworkState) -> str: + """Maps the Work stage names from the cloud backend to the status names in the Lightning framework.""" + mapping = { + V1LightningworkState.STOPPED: WorkStageStatus.STOPPED, + V1LightningworkState.PENDING: WorkStageStatus.PENDING, + V1LightningworkState.NOT_STARTED: WorkStageStatus.PENDING, + V1LightningworkState.IMAGE_BUILDING: WorkStageStatus.PENDING, + V1LightningworkState.RUNNING: WorkStageStatus.RUNNING, + V1LightningworkState.FAILED: WorkStageStatus.FAILED, + } + if stage not in mapping: + raise ValueError(f"Cannot map the lightning-cloud work state {stage} to the lightning status stage.") + return mapping[stage] + + +class CloudBackend(Backend): + def __init__( + self, + entrypoint_file, + queue_id: Optional[str] = None, + status_update_interval: int = 5, + ) -> None: + # TODO: Properly handle queue_id in the cloud. + super().__init__(entrypoint_file, queues=QueuingSystem("http"), queue_id=queue_id) + self._status_update_interval = status_update_interval + self._last_time_updated = None + self.client = LightningClient(retry=True) + self.base_url: Optional[str] = None + + @staticmethod + def _work_to_spec(work: LightningWork) -> V1LightningworkSpec: + work_requirements = "\n".join(work.cloud_build_config.requirements) + + build_spec = V1BuildSpec( + commands=work.cloud_build_config.build_commands(), + python_dependencies=V1PythonDependencyInfo( + package_manager=V1PackageManager.PIP, packages=work_requirements + ), + image=work.cloud_build_config.image, + ) + + drive_specs: List[V1LightningworkDrives] = [] + for drive_attr_name, drive in [ + (k, getattr(work, k)) for k in work._state if isinstance(getattr(work, k), Drive) + ]: + if drive.protocol == "lit://": + drive_type = V1DriveType.NO_MOUNT_S3 + source_type = V1SourceType.S3 + else: + drive_type = V1DriveType.UNSPECIFIED + source_type = V1SourceType.UNSPECIFIED + + drive_specs.append( + V1LightningworkDrives( + drive=V1Drive( + metadata=V1Metadata(name=f"{work.name}.{drive_attr_name}"), + spec=V1DriveSpec( + drive_type=drive_type, + source_type=source_type, + source=f"{drive.protocol}{drive.id}", + ), + status=V1DriveStatus(), + ), + mount_location=str(drive.root_folder), + ), + ) + + # this should really be part of the work.cloud_compute struct, but to save + # time we are not going to modify the backend in this set of PRs & instead + # use the same s3 drives API which we used before. + if work.cloud_compute.mounts is not None: + if isinstance(work.cloud_compute.mounts, Mount): + drive_specs.append( + _create_mount_drive_spec( + work_name=work.name, + mount=work.cloud_compute.mounts, + ) + ) + else: + for mount in work.cloud_compute.mounts: + drive_specs.append( + _create_mount_drive_spec( + work_name=work.name, + mount=mount, + ) + ) + + if hasattr(work.cloud_compute, "interruptible"): + preemptible = work.cloud_compute.interruptible + else: + preemptible = work.cloud_compute.preemptible + + colocation_group_id = None + if hasattr(work.cloud_compute, "colocation_group_id"): + colocation_group_id = work.cloud_compute.colocation_group_id + + user_compute_config = V1UserRequestedComputeConfig( + name=work.cloud_compute.name, + count=1, + disk_size=work.cloud_compute.disk_size, + preemptible=preemptible, + shm_size=work.cloud_compute.shm_size, + affinity_identifier=colocation_group_id, + ) + + random_name = "".join(random.choice(string.ascii_lowercase) for _ in range(5)) # noqa: S311 + + return V1LightningworkSpec( + build_spec=build_spec, + drives=drive_specs, + user_requested_compute_config=user_compute_config, + network_config=[V1NetworkConfig(name=random_name, port=work.port)], + desired_state=V1LightningworkState.RUNNING, + restart_policy=V1LightningappRestartPolicy.NEVER, + cluster_driver=V1LightningworkClusterDriver.DIRECT, + ) + + def create_work(self, app: LightningApp, work: LightningWork) -> None: + app_id = self._get_app_id() + project_id = self._get_project_id() + list_response: V1ListLightningworkResponse = self.client.lightningwork_service_list_lightningwork( + project_id=project_id, app_id=app_id + ) + external_specs: List[Externalv1Lightningwork] = list_response.lightningworks + + # Find THIS work in the list of all registered works + external_spec = None + for es in external_specs: + if es.name == work.name: + external_spec = es + break + + if external_spec is None: + spec = self._work_to_spec(work) + try: + fn = SpecLightningappInstanceIdWorksBody.__init__ + params = list(inspect.signature(fn).parameters) + extras = {} + if "display_name" in params: + extras["display_name"] = getattr(work, "display_name", "") + + external_spec = self.client.lightningwork_service_create_lightningwork( + project_id=project_id, + spec_lightningapp_instance_id=app_id, + body=SpecLightningappInstanceIdWorksBody( + name=work.name, + spec=spec, + **extras, + ), + ) + # overwriting spec with return value + spec = external_spec.spec + except ApiException as e: + # We might get exceed quotas, or be out of credits. + message = json.loads(e.body).get("message") + raise LightningPlatformException(message) from None + elif external_spec.spec.desired_state == V1LightningworkState.RUNNING: + spec = external_spec.spec + work._port = spec.network_config[0].port + else: + # Signal the LightningWorkState to go into state RUNNING + spec = external_spec.spec + + # getting the updated spec but ignoring everything other than port & drives + new_spec = self._work_to_spec(work) + + spec.desired_state = V1LightningworkState.RUNNING + spec.network_config[0].port = new_spec.network_config[0].port + spec.drives = new_spec.drives + spec.user_requested_compute_config = new_spec.user_requested_compute_config + spec.build_spec = new_spec.build_spec + spec.env = new_spec.env + try: + self.client.lightningwork_service_update_lightningwork( + project_id=project_id, + id=external_spec.id, + spec_lightningapp_instance_id=app_id, + body=WorksIdBody(spec), + ) + except ApiException as e: + # We might get exceed quotas, or be out of credits. + message = json.loads(e.body).get("message") + raise LightningPlatformException(message) from None + + # Replace the undefined url and host by the known one. + work._host = "0.0.0.0" # noqa: S104 + work._future_url = f"{self._get_proxy_scheme()}://{spec.network_config[0].host}" + + # removing the backend to avoid the threadlock error + _backend = work._backend + work._backend = None + app.work_queues[work.name].put(work) + work._backend = _backend + + logger.info(f"Starting work {work.name}") + logger.debug(f"With the following external spec: {external_spec}") + + def update_work_statuses(self, works: List[LightningWork]) -> None: + """Pulls the status of each Work instance in the cloud. + + Normally, the Lightning frameworks communicates statuses through the queues, but while the Work instance is + being provisionied, the queues don't exist yet and hence we need to make API calls directly to the backend to + fetch the status and update it in the states. + """ + if not works: + return + + # TODO: should this run in a timer thread instead? + if self._last_time_updated is not None and monotonic() - self._last_time_updated < self._status_update_interval: + return + + cloud_work_specs = self._get_cloud_work_specs(self.client) + local_works = works + for cloud_work_spec in cloud_work_specs: + for local_work in local_works: + # TODO (tchaton) Better resolve pending status after succeeded + + # 1. Skip if the work isn't the current one. + if local_work.name != cloud_work_spec.name: + continue + + # 2. Logic for idle timeout + self._handle_idle_timeout( + local_work.cloud_compute.idle_timeout, + local_work, + cloud_work_spec, + ) + + # 3. Map the cloud phase to the local one + cloud_stage = cloud_work_stage_to_work_status_stage( + cloud_work_spec.status.phase, + ) + + # 4. Detect if the work failed during pending phase + if local_work.status.stage == WorkStageStatus.PENDING and cloud_stage in WorkStageStatus.FAILED: + if local_work._raise_exception: + raise Exception(f"The work {local_work.name} failed during pending phase.") + logger.error(f"The work {local_work.name} failed during pending phase.") + + # 5. Skip the pending and running as this is already handled by Lightning. + if cloud_stage in (WorkStageStatus.PENDING, WorkStageStatus.RUNNING): + continue + + # TODO: Add the logic for wait_timeout + if local_work.status.stage != cloud_stage: + latest_hash = local_work._calls["latest_call_hash"] + if latest_hash is None: + continue + local_work._calls[latest_hash]["statuses"].append(make_status(cloud_stage)) + + self._last_time_updated = monotonic() + + def stop_all_works(self, works: List[LightningWork]) -> None: + """Stop resources for all LightningWorks in this app. + + The Works are stopped rather than deleted so that they can be inspected for debugging. + """ + cloud_works = self._get_cloud_work_specs(self.client) + + for cloud_work in cloud_works: + self._stop_work(cloud_work) + + def all_works_stopped(works: List[Externalv1Lightningwork]) -> bool: + for work in works: + # deleted work won't be in the request hence only checking for stopped & failed + if work.status.phase not in ( + V1LightningworkState.STOPPED, + V1LightningworkState.FAILED, + ): + return False + return True + + t0 = time() + while not all_works_stopped(self._get_cloud_work_specs(self.client)): + # Wait a little.. + print("Waiting for works to stop...") + sleep(3) + + # Break if we reached timeout. + if time() - t0 > LIGHTNING_STOP_TIMEOUT: + break + + def resolve_url(self, app, base_url: Optional[str] = None) -> None: + if not self.base_url: + self.base_url = base_url + + for flow in app.flows: + if self.base_url: + # Replacing the path with complete URL + if not (self.base_url.startswith("http://") or self.base_url.startswith("https://")): + raise ValueError( + "Base URL doesn't have a valid scheme, expected it to start with 'http://' or 'https://' " + ) + if isinstance(flow._layout, dict) and "target" not in flow._layout: + # FIXME: Why _check_service_url_is_ready doesn't work ? + frontend_url = urllib.parse.urljoin(self.base_url, flow.name + "/") + flow._layout["target"] = frontend_url + + for work in app.works: + if ( + work._url == "" + and work.status.stage + in ( + WorkStageStatus.RUNNING, + WorkStageStatus.SUCCEEDED, + ) + and work._internal_ip != "" + and _check_service_url_is_ready(f"http://{work._internal_ip}:{work._port}") + ): + work._url = work._future_url + + @staticmethod + def _get_proxy_scheme() -> str: + return os.environ.get("LIGHTNING_PROXY_SCHEME", "https") + + @staticmethod + def _get_app_id() -> str: + return os.environ["LIGHTNING_CLOUD_APP_ID"] + + @staticmethod + def _get_project_id() -> str: + return os.environ["LIGHTNING_CLOUD_PROJECT_ID"] + + @staticmethod + def _get_cloud_work_specs(client: LightningClient) -> List[Externalv1Lightningwork]: + list_response: V1ListLightningworkResponse = client.lightningwork_service_list_lightningwork( + project_id=CloudBackend._get_project_id(), + app_id=CloudBackend._get_app_id(), + ) + return list_response.lightningworks + + def _handle_idle_timeout(self, idle_timeout: float, work: LightningWork, resp: Externalv1Lightningwork) -> None: + if idle_timeout is None: + return + + if work.status.stage != WorkStageStatus.SUCCEEDED: + return + + if time() > (idle_timeout + work.status.timestamp): + logger.info(f"Idle Timeout {idle_timeout} has triggered. Stopping gracefully the {work.name}.") + latest_hash = work._calls["latest_call_hash"] + status = make_status(WorkStageStatus.STOPPED, reason=WorkStopReasons.PENDING) + work._calls[latest_hash]["statuses"].append(status) + self._stop_work(resp) + logger.debug(f"Stopping work: {resp.id}") + + def _register_queues(self, app, work): + super()._register_queues(app, work) + kw = {"queue_id": self.queue_id, "work_name": work.name} + app.work_queues.update({work.name: self.queues.get_work_queue(**kw)}) + + def stop_work(self, app: LightningApp, work: LightningWork) -> None: + cloud_works = self._get_cloud_work_specs(self.client) + for cloud_work in cloud_works: + if work.name == cloud_work.name: + self._stop_work(cloud_work) + + def _stop_work(self, work_resp: Externalv1Lightningwork) -> None: + spec: V1LightningworkSpec = work_resp.spec + if spec.desired_state == V1LightningworkState.DELETED: + # work is set to be deleted. Do nothing + return + if spec.desired_state == V1LightningworkState.STOPPED: + # work is set to be stopped already. Do nothing + return + if work_resp.status.phase == V1LightningworkState.FAILED: + # work is already failed. Do nothing + return + spec.desired_state = V1LightningworkState.STOPPED + self.client.lightningwork_service_update_lightningwork( + project_id=CloudBackend._get_project_id(), + id=work_resp.id, + spec_lightningapp_instance_id=CloudBackend._get_app_id(), + body=WorksIdBody(spec), + ) + print(f"Stopping {work_resp.name} ...") + + def delete_work(self, app: LightningApp, work: LightningWork) -> None: + cloud_works = self._get_cloud_work_specs(self.client) + for cloud_work in cloud_works: + if work.name == cloud_work.name: + self._delete_work(cloud_work) + + def _delete_work(self, work_resp: Externalv1Lightningwork) -> None: + spec: V1LightningworkSpec = work_resp.spec + if spec.desired_state == V1LightningworkState.DELETED: + # work is set to be deleted. Do nothing + return + spec.desired_state = V1LightningworkState.DELETED + self.client.lightningwork_service_update_lightningwork( + project_id=CloudBackend._get_project_id(), + id=work_resp.id, + spec_lightningapp_instance_id=CloudBackend._get_app_id(), + body=WorksIdBody(spec), + ) + print(f"Deleting {work_resp.name} ...") + + def update_lightning_app_frontend(self, app: "lightning.LightningApp"): # noqa: F821 + """Used to create frontend's if the app couldn't be loaded locally.""" + if not len(app.frontends.keys()): + return + + external_app_spec: "Externalv1LightningappInstance" = ( + self.client.lightningapp_instance_service_get_lightningapp_instance( + project_id=CloudBackend._get_project_id(), + id=CloudBackend._get_app_id(), + ) + ) + + frontend_specs = external_app_spec.spec.flow_servers + spec = external_app_spec.spec + if len(frontend_specs) != len(app.frontends.keys()): + frontend_specs: List[V1Flowserver] = [] + for flow_name in sorted(app.frontends.keys()): + frontend_spec = V1Flowserver(name=flow_name) + frontend_specs.append(frontend_spec) + + spec.flow_servers = frontend_specs + spec.enable_app_server = True + + logger.info("Found new frontends. Updating the app spec.") + + self.client.lightningapp_instance_service_update_lightningapp_instance( + project_id=CloudBackend._get_project_id(), + id=CloudBackend._get_app_id(), + body=AppinstancesIdBody(spec=spec), + ) + + def stop_app(self, app: "lightning.LightningApp"): # noqa: F821 + """Used to mark the App has stopped if everything has fine.""" + + external_app_spec: "Externalv1LightningappInstance" = ( + self.client.lightningapp_instance_service_get_lightningapp_instance( + project_id=CloudBackend._get_project_id(), + id=CloudBackend._get_app_id(), + ) + ) + + spec = external_app_spec.spec + spec.desired_state = V1LightningappInstanceState.STOPPED + + self.client.lightningapp_instance_service_update_lightningapp_instance( + project_id=CloudBackend._get_project_id(), + id=CloudBackend._get_app_id(), + body=AppinstancesIdBody(spec=spec), + ) + + +def _create_mount_drive_spec(work_name: str, mount: "Mount") -> V1LightningworkDrives: + if mount.protocol == "s3://": + drive_type = V1DriveType.INDEXED_S3 + source_type = V1SourceType.S3 + else: + raise RuntimeError( + f"unknown mounts protocol `{mount.protocol}`. Please verify this " + f"drive type has been configured for use in the cloud dispatcher." + ) + + return V1LightningworkDrives( + drive=V1Drive( + metadata=V1Metadata( + name=work_name, + ), + spec=V1DriveSpec( + drive_type=drive_type, + source_type=source_type, + source=mount.source, + ), + status=V1DriveStatus(), + ), + mount_location=str(mount.mount_path), + ) diff --git a/src/lightning/app/launcher/lightning_hybrid_backend.py b/src/lightning/app/launcher/lightning_hybrid_backend.py new file mode 100644 index 0000000000000..5391aca1d1566 --- /dev/null +++ b/src/lightning/app/launcher/lightning_hybrid_backend.py @@ -0,0 +1,155 @@ +import os +from typing import Optional + +from lightning_cloud.openapi import AppinstancesIdBody, Externalv1LightningappInstance + +from lightning.app.core import constants +from lightning.app.core.queues import QueuingSystem +from lightning.app.launcher.lightning_backend import CloudBackend +from lightning.app.runners.backends.backend import Backend +from lightning.app.runners.backends.mp_process import MultiProcessingBackend +from lightning.app.utilities.network import LightningClient + +if hasattr(constants, "get_cloud_queue_type"): + CLOUD_QUEUE_TYPE = constants.get_cloud_queue_type() or "redis" +else: + CLOUD_QUEUE_TYPE = "redis" + + +class CloudHybridBackend(Backend): + def __init__(self, *args, **kwargs): + super().__init__(*args, queues=QueuingSystem(CLOUD_QUEUE_TYPE), **kwargs) + cloud_backend = CloudBackend(*args, **kwargs) + kwargs.pop("queue_id") + multiprocess_backend = MultiProcessingBackend(*args, **kwargs) + + self.backends = {"cloud": cloud_backend, "multiprocess": multiprocess_backend} + self.work_to_network_configs = {} + + def create_work(self, app, work) -> None: + backend = self._get_backend(work) + if isinstance(backend, MultiProcessingBackend): + self._prepare_work_creation(app, work) + backend.create_work(app, work) + + def _prepare_work_creation(self, app, work) -> None: + app_id = self._get_app_id() + project_id = self._get_project_id() + assert project_id + + client = LightningClient() + list_apps_resp = client.lightningapp_instance_service_list_lightningapp_instances(project_id=project_id) + lightning_app: Optional[Externalv1LightningappInstance] = None + + for lightningapp in list_apps_resp.lightningapps: + if lightningapp.id == app_id: + lightning_app = lightningapp + + assert lightning_app + + network_configs = lightning_app.spec.network_config + + index = len(self.work_to_network_configs) + + if work.name not in self.work_to_network_configs: + self.work_to_network_configs[work.name] = network_configs[index] + + # Enable Ingress and update the specs. + lightning_app.spec.network_config[index].enable = True + + client.lightningapp_instance_service_update_lightningapp_instance( + project_id=project_id, + id=lightning_app.id, + body=AppinstancesIdBody(name=lightning_app.name, spec=lightning_app.spec), + ) + + work_network_config = self.work_to_network_configs[work.name] + + work._host = "0.0.0.0" # noqa: S104 + work._port = work_network_config.port + work._future_url = f"{self._get_proxy_scheme()}://{work_network_config.host}" + + def update_work_statuses(self, works) -> None: + if works: + backend = self._get_backend(works[0]) + backend.update_work_statuses(works) + + def stop_all_works(self, works) -> None: + if works: + backend = self._get_backend(works[0]) + backend.stop_all_works(works) + + def resolve_url(self, app, base_url: Optional[str] = None) -> None: + works = app.works + if works: + backend = self._get_backend(works[0]) + backend.resolve_url(app, base_url) + + def update_lightning_app_frontend(self, app: "lightning.LightningApp"): # noqa: F821 + self.backends["cloud"].update_lightning_app_frontend(app) + + def stop_work(self, app, work) -> None: + backend = self._get_backend(work) + if isinstance(backend, MultiProcessingBackend): + self._prepare_work_stop(app, work) + backend.stop_work(app, work) + + def delete_work(self, app, work) -> None: + backend = self._get_backend(work) + if isinstance(backend, MultiProcessingBackend): + self._prepare_work_stop(app, work) + backend.delete_work(app, work) + + def _prepare_work_stop(self, app, work): + app_id = self._get_app_id() + project_id = self._get_project_id() + assert project_id + + client = LightningClient() + list_apps_resp = client.lightningapp_instance_service_list_lightningapp_instances(project_id=project_id) + lightning_app: Optional[Externalv1LightningappInstance] = None + + for lightningapp in list_apps_resp.lightningapps: + if lightningapp.id == app_id: + lightning_app = lightningapp + + assert lightning_app + + network_config = self.work_to_network_configs[work.name] + + for nc in lightning_app.spec.network_config: + if nc.host == network_config.host: + nc.enable = False + + client.lightningapp_instance_service_update_lightningapp_instance( + project_id=project_id, + id=lightning_app.id, + body=AppinstancesIdBody(name=lightning_app.name, spec=lightning_app.spec), + ) + + del self.work_to_network_configs[work.name] + + def _register_queues(self, app, work): + backend = self._get_backend(work) + backend._register_queues(app, work) + + def _get_backend(self, work): + if work.cloud_compute.id == "default": + return self.backends["multiprocess"] + return self.backends["cloud"] + + @staticmethod + def _get_proxy_scheme() -> str: + return os.environ.get("LIGHTNING_PROXY_SCHEME", "https") + + @staticmethod + def _get_app_id() -> str: + return os.environ["LIGHTNING_CLOUD_APP_ID"] + + @staticmethod + def _get_project_id() -> str: + return os.environ["LIGHTNING_CLOUD_PROJECT_ID"] + + def stop_app(self, app: "lightning.LightningApp"): # noqa: F821 + """Used to mark the App has stopped if everything has fine.""" + self.backends["cloud"].stop_app(app) diff --git a/src/lightning/app/utilities/packaging/lightning_utils.py b/src/lightning/app/utilities/packaging/lightning_utils.py index c023e80776678..3852c941ed676 100644 --- a/src/lightning/app/utilities/packaging/lightning_utils.py +++ b/src/lightning/app/utilities/packaging/lightning_utils.py @@ -138,17 +138,6 @@ def _prepare_lightning_wheels_and_requirements(root: Path, package_name: str = " # Don't skip by default if (PACKAGE_LIGHTNING or is_lightning) and not bool(int(os.getenv("SKIP_LIGHTING_UTILITY_WHEELS_BUILD", "0"))): - # building and copying launcher wheel if installed in editable mode - launcher_project_path = get_dist_path_if_editable_install("lightning_launcher") - if launcher_project_path: - from lightning_launcher.__version__ import __version__ as launcher_version - - # todo: check why logging.info is missing in outputs - print(f"Packaged Lightning Launcher with your application. Version: {launcher_version}") - _prepare_wheel(launcher_project_path) - tar_name = _copy_tar(launcher_project_path, root) - tar_files.append(os.path.join(root, tar_name)) - # building and copying lightning-cloud wheel if installed in editable mode lightning_cloud_project_path = get_dist_path_if_editable_install("lightning_cloud") if lightning_cloud_project_path: diff --git a/tests/tests_app/cli/launch_data/app_v0/__init__.py b/tests/tests_app/cli/launch_data/app_v0/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/tests_app/cli/launch_data/app_v0/app.py b/tests/tests_app/cli/launch_data/app_v0/app.py new file mode 100644 index 0000000000000..7a8a4f27ced46 --- /dev/null +++ b/tests/tests_app/cli/launch_data/app_v0/app.py @@ -0,0 +1,51 @@ +# v0_app.py +import os +from datetime import datetime +from time import sleep + +import lightning as L +from lightning.app.frontend.web import StaticWebFrontend + + +class Word(L.LightningFlow): + def __init__(self, letter): + super().__init__() + self.letter = letter + self.repeats = letter + + def run(self): + self.repeats += self.letter + + def configure_layout(self): + return StaticWebFrontend(serve_dir=os.path.join(os.path.dirname(__file__), f"ui/{self.letter}")) + + +class V0App(L.LightningFlow): + def __init__(self): + super().__init__() + self.aas = Word("a") + self.bbs = Word("b") + self.counter = 0 + + def run(self): + now = datetime.now() + now = now.strftime("%H:%M:%S") + log = {"time": now, "a": self.aas.repeats, "b": self.bbs.repeats} + print(log) + self.aas.run() + self.bbs.run() + + sleep(2.0) + self.counter += 1 + + def configure_layout(self): + tab1 = {"name": "Tab_1", "content": self.aas} + tab2 = {"name": "Tab_2", "content": self.bbs} + tab3 = { + "name": "Tab_3", + "content": "https://tensorboard.dev/experiment/8m1aX0gcQ7aEmH0J7kbBtg/#scalars", + } + return [tab1, tab2, tab3] + + +app = L.LightningApp(V0App()) diff --git a/tests/tests_app/cli/launch_data/app_v0/ui/a/index.html b/tests/tests_app/cli/launch_data/app_v0/ui/a/index.html new file mode 100644 index 0000000000000..6ddb9a5a1323c --- /dev/null +++ b/tests/tests_app/cli/launch_data/app_v0/ui/a/index.html @@ -0,0 +1 @@ +