From 1438255dea9897658855bf43fe3e908175a07a52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Tue, 13 Dec 2022 14:50:08 +0100 Subject: [PATCH] Introduce `{Work,Flow}.lightningignore` (#15818) (cherry picked from commit edd2b4259a3074be0ab14ac5e808d796a1d5f3f9) --- .../run_app_on_cloud/cloud_files.rst | 13 +- src/lightning_app/CHANGELOG.md | 2 + .../components/multi_node/trainer.py | 3 + src/lightning_app/core/flow.py | 24 +++- src/lightning_app/core/work.py | 25 +++- src/lightning_app/runners/cloud.py | 52 +++++--- src/lightning_app/source_code/copytree.py | 6 +- src/lightning_app/source_code/local.py | 9 +- src/lightning_app/utilities/app_helpers.py | 6 +- tests/tests_app/runners/test_cloud.py | 115 +++++++++++++++--- 10 files changed, 214 insertions(+), 41 deletions(-) diff --git a/docs/source-app/workflows/run_app_on_cloud/cloud_files.rst b/docs/source-app/workflows/run_app_on_cloud/cloud_files.rst index 3130cd0f336b3..dfef0dc1c13aa 100644 --- a/docs/source-app/workflows/run_app_on_cloud/cloud_files.rst +++ b/docs/source-app/workflows/run_app_on_cloud/cloud_files.rst @@ -30,7 +30,6 @@ For example, the source code directory below with the ``.lightningignore`` file ├── requirements.txt └── model.pt - .. code:: bash ~/project/home ❯ cat .lightningignore @@ -39,6 +38,18 @@ For example, the source code directory below with the ``.lightningignore`` file A sample ``.lightningignore`` file can be found `here `_. +If you are a component author and your component creates local files that you want to ignore, you can do: + +.. code-block:: python + + class MyComponent(L.LightningWork): # or L.LightningFlow + def __init__(self): + super().__init__() + self.lightningignore = ("model.pt", "data_dir") + + +This has the benefit that the files will be ignored automatically for all the component users, making an easier +transition between running locally vs in the cloud. 
---- diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 46ed854a49295..f93f4a0a8d0fd 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -9,6 +9,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added +- Added `Lightning{Flow,Work}.lightningignore` attributes to programmatically ignore files before uploading to the cloud ([#15818](https://github.com/Lightning-AI/lightning/pull/15818)) + ### Changed diff --git a/src/lightning_app/components/multi_node/trainer.py b/src/lightning_app/components/multi_node/trainer.py index 76d744e24608c..e3f738abad329 100644 --- a/src/lightning_app/components/multi_node/trainer.py +++ b/src/lightning_app/components/multi_node/trainer.py @@ -114,3 +114,6 @@ def __init__( cloud_compute=cloud_compute, **work_kwargs, ) + + # the Trainer enables TensorBoard by default, so this is often an undesired directory to upload to the cloud + self.lightningignore += ("lightning_logs",) diff --git a/src/lightning_app/core/flow.py b/src/lightning_app/core/flow.py index 67854b5555831..302ba344320d1 100644 --- a/src/lightning_app/core/flow.py +++ b/src/lightning_app/core/flow.py @@ -10,7 +10,13 @@ from lightning_app.frontend import Frontend from lightning_app.storage import Path from lightning_app.storage.drive import _maybe_create_drive, Drive -from lightning_app.utilities.app_helpers import _is_json_serializable, _LightningAppRef, _set_child_name, is_overridden +from lightning_app.utilities.app_helpers import ( + _is_json_serializable, + _lightning_dispatched, + _LightningAppRef, + _set_child_name, + is_overridden, +) from lightning_app.utilities.component import _sanitize_state from lightning_app.utilities.exceptions import ExitAppException from lightning_app.utilities.introspection import _is_init_context, _is_run_context @@ -104,6 +110,8 @@ def __init__(self): self._layout: Union[List[Dict], Dict] = {} self._paths = {} self._backend: 
Optional[Backend] = None + # tuple instead of a list so that it cannot be modified without using the setter + self._lightningignore: Tuple[str, ...] = tuple() @property def name(self): @@ -310,6 +318,20 @@ def flows(self) -> Dict[str, "LightningFlow"]: flows.update(getattr(self, struct_name).flows) return flows + @property + def lightningignore(self) -> Tuple[str, ...]: + """Programmatic equivalent of the ``.lightningignore`` file.""" + return self._lightningignore + + @lightningignore.setter + def lightningignore(self, lightningignore: Tuple[str, ...]) -> None: + if _lightning_dispatched(): + raise RuntimeError( + f"Your app has been already dispatched, so modifying the `{self.name}.lightningignore` does not have an" + " effect" + ) + self._lightningignore = lightningignore + def works(self, recurse: bool = True) -> List[LightningWork]: """Return its :class:`~lightning_app.core.work.LightningWork`.""" works = [getattr(self, el) for el in sorted(self._works)] diff --git a/src/lightning_app/core/work.py b/src/lightning_app/core/work.py index 60d1ea62d8afb..43ffc0006d5ea 100644 --- a/src/lightning_app/core/work.py +++ b/src/lightning_app/core/work.py @@ -3,7 +3,7 @@ import warnings from copy import deepcopy from functools import partial, wraps -from typing import Any, Callable, Dict, List, Optional, Type, TYPE_CHECKING, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Type, TYPE_CHECKING, Union from deepdiff import DeepHash, Delta @@ -11,7 +11,12 @@ from lightning_app.storage import Path from lightning_app.storage.drive import _maybe_create_drive, Drive from lightning_app.storage.payload import Payload -from lightning_app.utilities.app_helpers import _is_json_serializable, _LightningAppRef, is_overridden +from lightning_app.utilities.app_helpers import ( + _is_json_serializable, + _lightning_dispatched, + _LightningAppRef, + is_overridden, +) from lightning_app.utilities.component import _is_flow_context, _sanitize_state from 
lightning_app.utilities.enum import ( CacheCallsKeys, @@ -154,6 +159,8 @@ def __init__( self._local_build_config = local_build_config or BuildConfig() self._cloud_build_config = cloud_build_config or BuildConfig() self._cloud_compute = cloud_compute or CloudCompute() + # tuple instead of a list so that it cannot be modified without using the setter + self._lightningignore: Tuple[str, ...] = tuple() self._backend: Optional[Backend] = None self._check_run_is_implemented() self._on_init_end() @@ -253,6 +260,20 @@ def cloud_compute(self, cloud_compute: CloudCompute) -> None: compute_store.remove(self.name) self._cloud_compute = cloud_compute + @property + def lightningignore(self) -> Tuple[str, ...]: + """Programmatic equivalent of the ``.lightningignore`` file.""" + return self._lightningignore + + @lightningignore.setter + def lightningignore(self, lightningignore: Tuple[str, ...]) -> None: + if _lightning_dispatched(): + raise RuntimeError( + f"Your app has been already dispatched, so modifying the `{self.name}.lightningignore` does not have an" + " effect" + ) + self._lightningignore = lightningignore + @property def status(self) -> WorkStatus: """Return the current status of the work. 
diff --git a/src/lightning_app/runners/cloud.py b/src/lightning_app/runners/cloud.py index 6ef7770124aae..36d39bac1f4b8 100644 --- a/src/lightning_app/runners/cloud.py +++ b/src/lightning_app/runners/cloud.py @@ -5,6 +5,7 @@ import sys import time from dataclasses import dataclass +from functools import partial from pathlib import Path from textwrap import dedent from typing import Any, List, Optional, Union @@ -62,6 +63,7 @@ from lightning_app.runners.backends.cloud import CloudBackend from lightning_app.runners.runtime import Runtime from lightning_app.source_code import LocalSourceCodeDir +from lightning_app.source_code.copytree import _filter_ignored, _parse_lightningignore from lightning_app.storage import Drive, Mount from lightning_app.utilities.app_helpers import _is_headless, Logger from lightning_app.utilities.cloud import _get_project @@ -217,7 +219,19 @@ def dispatch( root = Path(self.entrypoint_file).absolute().parent cleanup_handle = _prepare_lightning_wheels_and_requirements(root) self.app._update_index_file() - repo = LocalSourceCodeDir(path=root) + + # gather and merge all lightningignores + children = self.app.flows + self.app.works + lightningignores = [c.lightningignore for c in children] + if lightningignores: + merged = sum(lightningignores, tuple()) + logger.debug(f"Found the following lightningignores: {merged}") + patterns = _parse_lightningignore(merged) + ignore_functions = [partial(_filter_ignored, root, patterns)] + else: + ignore_functions = None + + repo = LocalSourceCodeDir(path=root, ignore_functions=ignore_functions) self._check_uploaded_folder(root, repo) requirements_file = root / "requirements.txt" # The entry point file needs to be relative to the root of the uploaded source file directory, @@ -493,24 +507,34 @@ def _ensure_cluster_project_binding(self, project_id: str, cluster_id: str): @staticmethod def _check_uploaded_folder(root: Path, repo: LocalSourceCodeDir) -> None: """This method is used to inform the users if their 
folder files are large and how to filter them.""" - lightning_tar = set(fnmatch.filter(repo.files, "*lightning-*.tar.gz")) - app_folder_size = sum(Path(p).stat().st_size for p in repo.files if p not in lightning_tar) - app_folder_size_in_mb = round(app_folder_size / (1000 * 1000), 5) + excludes = set(fnmatch.filter(repo.files, "*lightning-*.tar.gz")) + excludes.update(fnmatch.filter(repo.files, ".lightningignore")) + files = [Path(f) for f in repo.files if f not in excludes] + file_sizes = {f: f.stat().st_size for f in files} + mb = 1000_000 + app_folder_size_in_mb = sum(file_sizes.values()) / mb if app_folder_size_in_mb > CLOUD_UPLOAD_WARNING: - path_sizes = [(p, Path(p).stat().st_size / (1000 * 1000)) for p in repo.files] - largest_paths = sorted((x for x in path_sizes if x[-1] > 0.01), key=lambda x: x[1], reverse=True)[:25] - largest_paths_msg = "\n".join(f"{round(s, 5)} MB: {p}" for p, s in largest_paths) + # filter out files under 0.01mb + relevant_files = {f: sz for f, sz in file_sizes.items() if sz > 0.01 * mb} + if relevant_files: + by_largest = dict(sorted(relevant_files.items(), key=lambda x: x[1], reverse=True)) + by_largest = dict(list(by_largest.items())[:25]) # trim + largest_paths_msg = "\n".join( + f"{round(sz / mb, 5)} MB: {p.relative_to(root)}" for p, sz in by_largest.items() + ) + largest_paths_msg = f"Here are the largest files:\n{largest_paths_msg}\n" + else: + largest_paths_msg = "" warning_msg = ( f"Your application folder '{root.absolute()}' is more than {CLOUD_UPLOAD_WARNING} MB. " - f"The total size is {app_folder_size_in_mb} MB\n" - f"Here are the largest files: \n{largest_paths_msg}\n" - "Perhaps you should try running the app in an empty directory." + f"The total size is {round(app_folder_size_in_mb, 2)} MB. {len(files)} files were uploaded.\n" + + largest_paths_msg + + "Perhaps you should try running the app in an empty directory." 
) if not (root / DOT_IGNORE_FILENAME).is_file(): - warning_msg = ( - warning_msg - + "\nIn order to ignore some files or folder, " - + "create a `.lightningignore` file and add the paths to ignore." + warning_msg += ( + "\nIn order to ignore some files or folder, create a `.lightningignore` file and add the paths to" + " ignore. You can also set the `lightningingore` attribute in a Flow or Work." ) else: warning_msg += "\nYou can ignore some files or folders by adding them to `.lightningignore`." diff --git a/src/lightning_app/source_code/copytree.py b/src/lightning_app/source_code/copytree.py index 554537849f4cf..7435c332b50f6 100644 --- a/src/lightning_app/source_code/copytree.py +++ b/src/lightning_app/source_code/copytree.py @@ -3,18 +3,20 @@ from functools import partial from pathlib import Path from shutil import copy2, copystat, Error -from typing import Callable, List, Set, Union +from typing import Callable, List, Optional, Set, Union from lightning_app.core.constants import DOT_IGNORE_FILENAME from lightning_app.utilities.app_helpers import Logger logger = Logger(__name__) +_IGNORE_FUNCTION = Callable[[Path, List[Path]], List[Path]] + def _copytree( src: Union[Path, str], dst: Union[Path, str], - ignore_functions: List[Callable] = None, + ignore_functions: Optional[List[_IGNORE_FUNCTION]] = None, dirs_exist_ok=False, dry_run=False, ) -> List[str]: diff --git a/src/lightning_app/source_code/local.py b/src/lightning_app/source_code/local.py index b461d3814e9db..79d655cefbc06 100644 --- a/src/lightning_app/source_code/local.py +++ b/src/lightning_app/source_code/local.py @@ -4,7 +4,7 @@ from shutil import rmtree from typing import List, Optional -from lightning_app.source_code.copytree import _copytree +from lightning_app.source_code.copytree import _copytree, _IGNORE_FUNCTION from lightning_app.source_code.hashing import _get_hash from lightning_app.source_code.tar import _tar_path from lightning_app.source_code.uploader import FileUploader @@ -15,8 +15,9 
@@ class LocalSourceCodeDir: cache_location: Path = Path.home() / ".lightning" / "cache" / "repositories" - def __init__(self, path: Path): + def __init__(self, path: Path, ignore_functions: Optional[List[_IGNORE_FUNCTION]] = None) -> None: self.path = path + self.ignore_functions = ignore_functions # cache checksum version self._version: Optional[str] = None @@ -33,7 +34,7 @@ def __init__(self, path: Path): def files(self) -> List[str]: """Returns a set of files that are not ignored by .lightningignore.""" if self._non_ignored_files is None: - self._non_ignored_files = _copytree(self.path, "", dry_run=True) + self._non_ignored_files = _copytree(self.path, "", ignore_functions=self.ignore_functions, dry_run=True) return self._non_ignored_files @property @@ -59,7 +60,7 @@ def packaging_session(self) -> Path: session_path = self.cache_location / "packaging_sessions" / self.version try: rmtree(session_path, ignore_errors=True) - _copytree(self.path, session_path) + _copytree(self.path, session_path, ignore_functions=self.ignore_functions) yield session_path finally: rmtree(session_path, ignore_errors=True) diff --git a/src/lightning_app/utilities/app_helpers.py b/src/lightning_app/utilities/app_helpers.py index 83b78e1929aa5..3b152786b682a 100644 --- a/src/lightning_app/utilities/app_helpers.py +++ b/src/lightning_app/utilities/app_helpers.py @@ -511,11 +511,15 @@ def is_static_method(klass_or_instance, attr) -> bool: return isinstance(inspect.getattr_static(klass_or_instance, attr), staticmethod) +def _lightning_dispatched() -> bool: + return bool(int(os.getenv("LIGHTNING_DISPATCHED", 0))) + + def _should_dispatch_app() -> bool: return ( __debug__ and "_pytest.doctest" not in sys.modules - and not bool(int(os.getenv("LIGHTNING_DISPATCHED", "0"))) + and not _lightning_dispatched() and "LIGHTNING_APP_STATE_URL" not in os.environ ) diff --git a/tests/tests_app/runners/test_cloud.py b/tests/tests_app/runners/test_cloud.py index e89e1e8aa468d..7ce4ea397d95b 100644 --- 
a/tests/tests_app/runners/test_cloud.py +++ b/tests/tests_app/runners/test_cloud.py @@ -43,13 +43,15 @@ V1Work, ) -from lightning_app import BuildConfig, LightningApp, LightningWork +from lightning_app import BuildConfig, LightningApp, LightningFlow, LightningWork from lightning_app.runners import backends, cloud, CloudRuntime from lightning_app.runners.cloud import ( _generate_works_json_gallery, _generate_works_json_web, _validate_build_spec_and_compute, ) +from lightning_app.source_code.copytree import _copytree, _parse_lightningignore +from lightning_app.source_code.local import LocalSourceCodeDir from lightning_app.storage import Drive, Mount from lightning_app.testing.helpers import EmptyWork from lightning_app.utilities.cloud import _get_project @@ -1247,31 +1249,38 @@ def test_get_project(monkeypatch): assert ret.project_id == "test-project-id1" +def write_file_of_size(path, size): + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, "wb") as f: + f.seek(size) + f.write(b"\0") + + @mock.patch("lightning_app.core.queues.QueuingSystem", MagicMock()) @mock.patch("lightning_app.runners.backends.cloud.LightningClient", MagicMock()) def test_check_uploaded_folder(monkeypatch, tmpdir, caplog): - - monkeypatch.setattr(cloud, "logger", logging.getLogger()) - app = MagicMock() - repo = MagicMock() + root = Path(tmpdir) + repo = LocalSourceCodeDir(root) backend = cloud.CloudRuntime(app) with caplog.at_level(logging.WARN): - backend._check_uploaded_folder(Path(tmpdir), repo) + backend._check_uploaded_folder(root, repo) assert caplog.messages == [] - mock = MagicMock() - mock.st_mode = 33188 - mock.st_size = 5 * 1000 * 1000 - repo.files = [str(Path("./a.png"))] - monkeypatch.setattr(Path, "stat", MagicMock(return_value=mock)) + # write some files to assert the message below. 
+ write_file_of_size(root / "a.png", 4 * 1000 * 1000) + write_file_of_size(root / "b.txt", 5 * 1000 * 1000) + write_file_of_size(root / "c.jpg", 6 * 1000 * 1000) - path = Path(".") + repo._non_ignored_files = None # force reset with caplog.at_level(logging.WARN): - backend._check_uploaded_folder(path, repo) - assert caplog.messages[0].startswith( - f"Your application folder '{path.absolute()}' is more than 2 MB. The total size is 5.0 MB" - ) + backend._check_uploaded_folder(root, repo) + assert f"Your application folder '{root.absolute()}' is more than 2 MB" in caplog.text + assert "The total size is 15.0 MB" in caplog.text + assert "3 files were uploaded" in caplog.text + assert "files:\n6.0 MB: c.jpg\n5.0 MB: b.txt\n4.0 MB: a.png\nPerhaps" in caplog.text # tests the order + assert "create a `.lightningignore` file" in caplog.text + assert "lightningingore` attribute in a Flow or Work" in caplog.text @mock.patch("lightning_app.core.queues.QueuingSystem", MagicMock()) @@ -1433,6 +1442,80 @@ def run(self): _validate_build_spec_and_compute(Work()) +def test_programmatic_lightningignore(monkeypatch, caplog, tmpdir): + monkeypatch.setenv("LIGHTNING_DISPATCHED", "0") # this is not cleaned up + + mock_client = mock.MagicMock() + mock_client.projects_service_list_memberships.return_value = V1ListMembershipsResponse( + memberships=[V1Membership(name="test-project", project_id="test-project-id")] + ) + mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = ( + V1ListLightningappInstancesResponse(lightningapps=[]) + ) + mock_client.lightningapp_v2_service_create_lightningapp_release.return_value = V1LightningappRelease( + cluster_id="test" + ) + cloud_backend = mock.MagicMock(client=mock_client) + monkeypatch.setattr(backends, "CloudBackend", mock.MagicMock(return_value=cloud_backend)) + + class MyWork(LightningWork): + def __init__(self): + super().__init__() + self.lightningignore += ("foo", "lightning_logs") + + def run(self): + with 
pytest.raises(RuntimeError, match="w.lightningignore` does not"): + self.lightningignore += ("foobar",) + + class MyFlow(LightningFlow): + def __init__(self): + super().__init__() + self.lightningignore = ("foo",) + self.w = MyWork() + + def run(self): + with pytest.raises(RuntimeError, match="root.lightningignore` does not"): + self.lightningignore = ("baz",) + self.w.run() + + flow = MyFlow() + app = LightningApp(flow) + + monkeypatch.setattr(app, "_update_index_file", mock.MagicMock()) + + path = Path(tmpdir) + cloud_runtime = cloud.CloudRuntime(app=app, entrypoint_file=path / "entrypoint.py") + monkeypatch.setattr(LocalSourceCodeDir, "upload", mock.MagicMock()) + + # write some files + write_file_of_size(path / "a.txt", 5 * 1000 * 1000) + write_file_of_size(path / "foo.png", 4 * 1000 * 1000) + write_file_of_size(path / "lightning_logs" / "foo.ckpt", 6 * 1000 * 1000) + # also an actual .lightningignore file + (path / ".lightningignore").write_text("foo.png") + + with mock.patch( + "lightning_app.runners.cloud._parse_lightningignore", wraps=_parse_lightningignore + ) as parse_mock, mock.patch( + "lightning_app.source_code.local._copytree", wraps=_copytree + ) as copy_mock, caplog.at_level( + logging.WARN + ): + cloud_runtime.dispatch() + + parse_mock.assert_called_once_with(("foo", "foo", "lightning_logs")) + assert copy_mock.mock_calls[0].kwargs["ignore_functions"][0].args[1] == {"lightning_logs", "foo"} + + assert f"Your application folder '{path.absolute()}' is more than 2 MB" in caplog.text + assert "The total size is 5.0 MB" in caplog.text + assert "2 files were uploaded" # a.txt and .lightningignore + assert "files:\n5.0 MB: a.txt\nPerhaps" in caplog.text # only this file appears + + # replicate how the app would dispatch the app, and call `run` + monkeypatch.setenv("LIGHTNING_DISPATCHED", "1") + flow.run() + + @pytest.mark.parametrize( "lightning_app_instance, lightning_cloud_url, expected_url", [