Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[App] Raise error when launching app on multiple clusters #15484

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
32eb9fc
Error when running on multiple clusters
luca3rd Nov 2, 2022
5a60a9d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 2, 2022
d9bc71c
Revert this in separate PR: keep this focused
luca3rd Nov 5, 2022
cad231f
Improve testing
luca3rd Nov 5, 2022
0aa51d8
fixup! Improve testing
luca3rd Nov 6, 2022
9de897e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 6, 2022
18b8b70
Merge branch 'master' into ENG-497-bug-cant-run-app-on-a-cluster-if-i…
luca3rd Nov 7, 2022
ac9ccab
pass flake8
luca3rd Nov 7, 2022
97eb392
Update changelog
luca3rd Nov 7, 2022
17af0f8
Address PR feedback
luca3rd Nov 9, 2022
be6b745
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 9, 2022
2af76cd
Merge branch 'master' into ENG-497-bug-cant-run-app-on-a-cluster-if-i…
luca3rd Nov 9, 2022
58c733f
remove unused import
luca3rd Nov 9, 2022
234c782
Merge branch 'master' into ENG-497-bug-cant-run-app-on-a-cluster-if-i…
luca3rd Nov 10, 2022
d7bd429
Merge branch 'master' into ENG-497-bug-cant-run-app-on-a-cluster-if-i…
luca3rd Nov 21, 2022
7a093a8
Reword error message
luca3rd Nov 21, 2022
3474c5b
Error if running on cluster that doesn't exist
luca3rd Nov 21, 2022
9cd61e5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 21, 2022
0701419
fixup! Error if running on cluster that doesn't exist
luca3rd Nov 21, 2022
165d4ba
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 21, 2022
74945de
Merge branch 'master' into ENG-497-bug-cant-run-app-on-a-cluster-if-i…
Borda Nov 21, 2022
e61ac3d
Merge branch 'master' into ENG-497-bug-cant-run-app-on-a-cluster-if-i…
luca3rd Nov 28, 2022
bc9fb75
Merge branch 'master' into ENG-497-bug-cant-run-app-on-a-cluster-if-i…
luca3rd Nov 28, 2022
511fac2
Merge branch 'master' into ENG-497-bug-cant-run-app-on-a-cluster-if-i…
luca3rd Nov 28, 2022
cc6f4e9
Remove unsued import
luca3rd Nov 29, 2022
09e06f4
Merge branch 'master' into ENG-497-bug-cant-run-app-on-a-cluster-if-i…
Borda Nov 30, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/lightning_app/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

## [1.8.1] - 2022-11-10

- Fixed bug when launching apps on multiple clusters ([#15484](https://github.com/Lightning-AI/lightning/pull/15484))


### Added

Expand Down
22 changes: 16 additions & 6 deletions src/lightning_app/runners/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,16 @@
import time
from dataclasses import dataclass
from pathlib import Path
from textwrap import dedent
from typing import Any, Callable, List, Optional, Union

import click
from lightning_cloud.openapi import (
Body3,
Body4,
Body7,
Body8,
Body9,
Externalv1LightningappInstance,
Gridv1ImageSpec,
V1BuildSpec,
V1DependencyFileInfo,
Expand Down Expand Up @@ -336,6 +337,7 @@ def dispatch(
elif CLOUD_QUEUE_TYPE == "redis":
queue_server_type = V1QueueServerType.REDIS

existing_instance: Optional[Externalv1LightningappInstance] = None
if find_instances_resp.lightningapps:
existing_instance = find_instances_resp.lightningapps[0]

Expand Down Expand Up @@ -374,12 +376,20 @@ def dispatch(
f"Your app last ran on cluster {app_config.cluster_id}, but that cluster "
"doesn't exist anymore."
)
click.confirm(
f"{msg} Do you want to run on Lightning Cloud instead?",
abort=True,
default=True,
raise ValueError(msg)
if existing_instance and existing_instance.spec.cluster_id != app_config.cluster_id:
luca3rd marked this conversation as resolved.
Show resolved Hide resolved
raise ValueError(
dedent(
f"""\
An app names {app_config.name} is already running on cluster {existing_instance.spec.cluster_id}, and you requested it to run on cluster {app_config.cluster_id}.

In order to proceed, please either:
a. rename the app to run on {app_config.cluster_id} with the --name option
lightning run app {app_entrypoint_file} --name (new name) --cloud --cluster-id {app_config.cluster_id}
b. delete the app running on {existing_instance.spec.cluster_id} in the UI before running this command.
lantiga marked this conversation as resolved.
Show resolved Hide resolved
""" # noqa: E501
)
)
app_config.cluster_id = None

if app_config.cluster_id is not None:
self._ensure_cluster_project_binding(project.project_id, app_config.cluster_id)
Expand Down
120 changes: 83 additions & 37 deletions tests/tests_app/runners/test_cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import re
import sys
from contextlib import nullcontext as does_not_raise
from copy import copy
from pathlib import Path
from unittest import mock
Expand Down Expand Up @@ -101,75 +102,120 @@ def get_cloud_runtime_request_body(**kwargs) -> "Body8":
return Body8(**default_request_body)


@pytest.fixture
def cloud_backend(monkeypatch):
cloud_backend = mock.MagicMock()
monkeypatch.setattr(cloud, "LocalSourceCodeDir", mock.MagicMock())
monkeypatch.setattr(cloud, "_prepare_lightning_wheels_and_requirements", mock.MagicMock())
monkeypatch.setattr(backends, "CloudBackend", mock.MagicMock(return_value=cloud_backend))
return cloud_backend


@pytest.fixture
def project_id():
return "test-project-id"


DEFAULT_CLUSTER = "litng-ai-03"


class TestAppCreationClient:
"""Testing the calls made using GridRestClient to create the app."""

def test_run_on_deleted_cluster(self, cloud_backend):
app_name = "test-app"

mock_client = mock.MagicMock()
mock_client.projects_service_list_memberships.return_value = V1ListMembershipsResponse(
memberships=[V1Membership(name="Default Project", project_id=project_id)]
)

mock_client.cluster_service_list_clusters.return_value = V1ListClustersResponse(
[
Externalv1Cluster(id=DEFAULT_CLUSTER),
]
)
cloud_backend.client = mock_client

app = mock.MagicMock()
app.flows = []
app.frontend = {}

existing_instance = MagicMock()
existing_instance.status.phase = V1LightningappInstanceState.STOPPED
existing_instance.spec.cluster_id = DEFAULT_CLUSTER
mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = (
V1ListLightningappInstancesResponse(lightningapps=[existing_instance])
)

cloud_runtime = cloud.CloudRuntime(app=app, entrypoint_file="entrypoint.py")
cloud_runtime._check_uploaded_folder = mock.MagicMock()

with pytest.raises(ValueError, match="that cluster doesn't exist"):
cloud_runtime.dispatch(name=app_name, cluster_id="unknown-cluster")

# TODO: remove this test once there is support for multiple instances
@mock.patch("lightning_app.runners.backends.cloud.LightningClient", mock.MagicMock())
def test_new_instance_on_different_cluster(self, monkeypatch):
app_name = "test-app-name"
original_cluster = "cluster-001"
new_cluster = "cluster-002"
@pytest.mark.parametrize(
"old_cluster,new_cluster,expected_raise",
[
(
"test",
"other",
pytest.raises(
ValueError,
match="already running on cluster",
),
),
("test", "test", does_not_raise()),
(None, None, does_not_raise()),
(None, "litng-ai-03", does_not_raise()),
("litng-ai-03", None, does_not_raise()),
],
)
def test_new_instance_on_different_cluster(
self, cloud_backend, project_id, old_cluster, new_cluster, expected_raise
):
app_name = "test-app"

mock_client = mock.MagicMock()
mock_client.projects_service_list_memberships.return_value = V1ListMembershipsResponse(
memberships=[V1Membership(name="Default Project", project_id="default-project-id")]
memberships=[V1Membership(name="Default Project", project_id=project_id)]
)
mock_client.lightningapp_v2_service_create_lightningapp_release.return_value = V1LightningappRelease(
cluster_id=new_cluster
)

# Note:
# backend converts "None" cluster to "litng-ai-03"
# dispatch should receive None, but API calls should return "litng-ai-03"
mock_client.cluster_service_list_clusters.return_value = V1ListClustersResponse(
[Externalv1Cluster(id=original_cluster), Externalv1Cluster(id=new_cluster)]
[
Externalv1Cluster(id=old_cluster or DEFAULT_CLUSTER),
luca3rd marked this conversation as resolved.
Show resolved Hide resolved
Externalv1Cluster(id=new_cluster or DEFAULT_CLUSTER),
]
)

cloud_backend = mock.MagicMock()
cloud_backend.client = mock_client
monkeypatch.setattr(cloud, "LocalSourceCodeDir", mock.MagicMock())
monkeypatch.setattr(cloud, "_prepare_lightning_wheels_and_requirements", mock.MagicMock())
monkeypatch.setattr(backends, "CloudBackend", mock.MagicMock(return_value=cloud_backend))

app = mock.MagicMock()
app.flows = []
app.frontend = {}

existing_instance = MagicMock()
existing_instance.status.phase = V1LightningappInstanceState.STOPPED
existing_instance.spec.cluster_id = original_cluster
existing_instance.spec.cluster_id = old_cluster or DEFAULT_CLUSTER
mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = (
V1ListLightningappInstancesResponse(lightningapps=[existing_instance])
)

cloud_runtime = cloud.CloudRuntime(app=app, entrypoint_file="entrypoint.py")
cloud_runtime._check_uploaded_folder = mock.MagicMock()

# without requirements file
# setting is_file to False so requirements.txt existence check will return False
monkeypatch.setattr(Path, "is_file", lambda *args, **kwargs: False)
monkeypatch.setattr(cloud, "Path", Path)

# This is the main assertion:
# we have an existing instance on `cluster-001`
# but we want to run this app on `cluster-002`
cloud_runtime.dispatch(name=app_name, cluster_id=new_cluster)

body = Body8(
cluster_id=new_cluster,
app_entrypoint_file=mock.ANY,
enable_app_server=True,
flow_servers=[],
image_spec=None,
works=[],
local_source=True,
dependency_cache_key=mock.ANY,
user_requested_flow_compute_config=mock.ANY,
)
cloud_runtime.backend.client.lightningapp_v2_service_create_lightningapp_release.assert_called_once_with(
project_id="default-project-id", app_id=mock.ANY, body=body
)
cloud_runtime.backend.client.projects_service_create_project_cluster_binding.assert_called_once_with(
project_id="default-project-id",
body=V1ProjectClusterBinding(cluster_id=new_cluster, project_id="default-project-id"),
)
with expected_raise:
cloud_runtime.dispatch(name=app_name, cluster_id=new_cluster)

@pytest.mark.parametrize("flow_cloud_compute", [None, CloudCompute(name="t2.medium")])
@mock.patch("lightning_app.runners.backends.cloud.LightningClient", mock.MagicMock())
Expand Down