From c5d3bba7643645c7d319ea7c8f865e4069390ce0 Mon Sep 17 00:00:00 2001 From: Luca Furst Date: Wed, 30 Nov 2022 11:33:57 -0500 Subject: [PATCH 1/7] [App] Raise error when launching app on multiple clusters (#15484) * Error when running on multiple clusters * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Revert this in separate PR: keep this focused * Improve testing * fixup! Improve testing * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * pass flake8 * Update changelog * Address PR feedback * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove unused import * Reword error message * Error if running on cluster that doesn't exist * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixup! Error if running on cluster that doesn't exist * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Remove unsued import Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> --- src/lightning_app/CHANGELOG.md | 2 + src/lightning_app/runners/cloud.py | 22 +++-- tests/tests_app/runners/test_cloud.py | 120 ++++++++++++++++++-------- 3 files changed, 101 insertions(+), 43 deletions(-) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index cf115dd5e285f6..59acf63a1fee0c 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -94,6 +94,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ## [1.8.1] - 2022-11-10 +- Fixed bug when launching apps on multiple clusters ([#15484](https://github.com/Lightning-AI/lightning/pull/15484)) + ### Added diff --git a/src/lightning_app/runners/cloud.py b/src/lightning_app/runners/cloud.py index 0752a3b5be8a89..d3613975f6076a 100644 --- a/src/lightning_app/runners/cloud.py +++ b/src/lightning_app/runners/cloud.py @@ -6,15 +6,16 @@ import time from dataclasses import dataclass from pathlib import Path +from textwrap import dedent from typing import Any, Callable, List, Optional, Union -import click from lightning_cloud.openapi import ( Body3, Body4, Body7, Body8, Body9, + Externalv1LightningappInstance, Gridv1ImageSpec, V1BuildSpec, V1DependencyFileInfo, @@ -336,6 +337,7 @@ def dispatch( elif CLOUD_QUEUE_TYPE == "redis": queue_server_type = V1QueueServerType.REDIS + existing_instance: Optional[Externalv1LightningappInstance] = None if find_instances_resp.lightningapps: existing_instance = find_instances_resp.lightningapps[0] @@ -374,12 +376,20 @@ def dispatch( f"Your app last ran on cluster {app_config.cluster_id}, but that cluster " "doesn't exist anymore." ) - click.confirm( - f"{msg} Do you want to run on Lightning Cloud instead?", - abort=True, - default=True, + raise ValueError(msg) + if existing_instance and existing_instance.spec.cluster_id != app_config.cluster_id: + raise ValueError( + dedent( + f"""\ + An app names {app_config.name} is already running on cluster {existing_instance.spec.cluster_id}, and you requested it to run on cluster {app_config.cluster_id}. + + In order to proceed, please either: + a. rename the app to run on {app_config.cluster_id} with the --name option + lightning run app {app_entrypoint_file} --name (new name) --cloud --cluster-id {app_config.cluster_id} + b. delete the app running on {existing_instance.spec.cluster_id} in the UI before running this command. + """ # noqa: E501 + ) ) - app_config.cluster_id = None if app_config.cluster_id is not None: self._ensure_cluster_project_binding(project.project_id, app_config.cluster_id) diff --git a/tests/tests_app/runners/test_cloud.py b/tests/tests_app/runners/test_cloud.py index d1a6f9daaffe13..1f525341224eb9 100644 --- a/tests/tests_app/runners/test_cloud.py +++ b/tests/tests_app/runners/test_cloud.py @@ -2,6 +2,7 @@ import os import re import sys +from contextlib import nullcontext as does_not_raise from copy import copy from pathlib import Path from unittest import mock @@ -101,32 +102,100 @@ def get_cloud_runtime_request_body(**kwargs) -> "Body8": return Body8(**default_request_body) +@pytest.fixture +def cloud_backend(monkeypatch): + cloud_backend = mock.MagicMock() + monkeypatch.setattr(cloud, "LocalSourceCodeDir", mock.MagicMock()) + monkeypatch.setattr(cloud, "_prepare_lightning_wheels_and_requirements", mock.MagicMock()) + monkeypatch.setattr(backends, "CloudBackend", mock.MagicMock(return_value=cloud_backend)) + return cloud_backend + + +@pytest.fixture +def project_id(): + return "test-project-id" + + +DEFAULT_CLUSTER = "litng-ai-03" + + class TestAppCreationClient: """Testing the calls made using GridRestClient to create the app.""" + def test_run_on_deleted_cluster(self, cloud_backend): + app_name = "test-app" + + mock_client = mock.MagicMock() + mock_client.projects_service_list_memberships.return_value = V1ListMembershipsResponse( + memberships=[V1Membership(name="Default Project", project_id=project_id)] + ) + + mock_client.cluster_service_list_clusters.return_value = V1ListClustersResponse( + [ + Externalv1Cluster(id=DEFAULT_CLUSTER), + ] + ) + cloud_backend.client = mock_client + + app = mock.MagicMock() + app.flows = [] + app.frontend = {} + + existing_instance = MagicMock() + existing_instance.status.phase = V1LightningappInstanceState.STOPPED + existing_instance.spec.cluster_id = DEFAULT_CLUSTER + mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = ( + V1ListLightningappInstancesResponse(lightningapps=[existing_instance]) + ) + + cloud_runtime = cloud.CloudRuntime(app=app, entrypoint_file="entrypoint.py") + cloud_runtime._check_uploaded_folder = mock.MagicMock() + + with pytest.raises(ValueError, match="that cluster doesn't exist"): + cloud_runtime.dispatch(name=app_name, cluster_id="unknown-cluster") + # TODO: remove this test once there is support for multiple instances - @mock.patch("lightning_app.runners.backends.cloud.LightningClient", mock.MagicMock()) - def test_new_instance_on_different_cluster(self, monkeypatch): - app_name = "test-app-name" - original_cluster = "cluster-001" - new_cluster = "cluster-002" + @pytest.mark.parametrize( + "old_cluster,new_cluster,expected_raise", + [ + ( + "test", + "other", + pytest.raises( + ValueError, + match="already running on cluster", + ), + ), + ("test", "test", does_not_raise()), + (None, None, does_not_raise()), + (None, "litng-ai-03", does_not_raise()), + ("litng-ai-03", None, does_not_raise()), + ], + ) + def test_new_instance_on_different_cluster( + self, cloud_backend, project_id, old_cluster, new_cluster, expected_raise + ): + app_name = "test-app" mock_client = mock.MagicMock() mock_client.projects_service_list_memberships.return_value = V1ListMembershipsResponse( - memberships=[V1Membership(name="Default Project", project_id="default-project-id")] + memberships=[V1Membership(name="Default Project", project_id=project_id)] ) mock_client.lightningapp_v2_service_create_lightningapp_release.return_value = V1LightningappRelease( cluster_id=new_cluster ) + + # Note: + # backend converts "None" cluster to "litng-ai-03" + # dispatch should receive None, but API calls should return "litng-ai-03" mock_client.cluster_service_list_clusters.return_value = V1ListClustersResponse( - [Externalv1Cluster(id=original_cluster), Externalv1Cluster(id=new_cluster)] + [ + Externalv1Cluster(id=old_cluster or DEFAULT_CLUSTER), + Externalv1Cluster(id=new_cluster or DEFAULT_CLUSTER), + ] ) - cloud_backend = mock.MagicMock() cloud_backend.client = mock_client - monkeypatch.setattr(cloud, "LocalSourceCodeDir", mock.MagicMock()) - monkeypatch.setattr(cloud, "_prepare_lightning_wheels_and_requirements", mock.MagicMock()) - monkeypatch.setattr(backends, "CloudBackend", mock.MagicMock(return_value=cloud_backend)) app = mock.MagicMock() app.flows = [] @@ -134,7 +203,7 @@ def test_new_instance_on_different_cluster(self, monkeypatch): existing_instance = MagicMock() existing_instance.status.phase = V1LightningappInstanceState.STOPPED - existing_instance.spec.cluster_id = original_cluster + existing_instance.spec.cluster_id = old_cluster or DEFAULT_CLUSTER mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = ( V1ListLightningappInstancesResponse(lightningapps=[existing_instance]) ) @@ -142,34 +211,11 @@ def test_new_instance_on_different_cluster(self, monkeypatch): cloud_runtime = cloud.CloudRuntime(app=app, entrypoint_file="entrypoint.py") cloud_runtime._check_uploaded_folder = mock.MagicMock() - # without requirements file - # setting is_file to False so requirements.txt existence check will return False - monkeypatch.setattr(Path, "is_file", lambda *args, **kwargs: False) - monkeypatch.setattr(cloud, "Path", Path) - # This is the main assertion: # we have an existing instance on `cluster-001` # but we want to run this app on `cluster-002` - cloud_runtime.dispatch(name=app_name, cluster_id=new_cluster) - - body = Body8( - cluster_id=new_cluster, - app_entrypoint_file=mock.ANY, - enable_app_server=True, - flow_servers=[], - image_spec=None, - works=[], - local_source=True, - dependency_cache_key=mock.ANY, - user_requested_flow_compute_config=mock.ANY, - ) - cloud_runtime.backend.client.lightningapp_v2_service_create_lightningapp_release.assert_called_once_with( - project_id="default-project-id", app_id=mock.ANY, body=body - ) - cloud_runtime.backend.client.projects_service_create_project_cluster_binding.assert_called_once_with( - project_id="default-project-id", - body=V1ProjectClusterBinding(cluster_id=new_cluster, project_id="default-project-id"), - ) + with expected_raise: + cloud_runtime.dispatch(name=app_name, cluster_id=new_cluster) @pytest.mark.parametrize("flow_cloud_compute", [None, CloudCompute(name="t2.medium")]) @mock.patch("lightning_app.runners.backends.cloud.LightningClient", mock.MagicMock()) From 58644092d23f2de33a805671bdf2c7f9142a4ab7 Mon Sep 17 00:00:00 2001 From: Sherin Thomas Date: Thu, 1 Dec 2022 15:23:01 +0530 Subject: [PATCH 2/7] Moving `lightning_api_access` out of base requirements (#15844) * moving the requirements to components extras * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * component requirements to devel * importing torch in local scope * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * skipping doctest Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Jirka Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> --- requirements/app/base.txt | 1 - requirements/app/components.txt | 2 ++ requirements/app/devel.txt | 3 +++ src/lightning_app/components/serve/python_server.py | 10 ++++++++-- 4 files changed, 13 insertions(+), 3 deletions(-) create mode 100644 requirements/app/components.txt diff --git a/requirements/app/base.txt b/requirements/app/base.txt index a3da6958b6688b..5c39e9ddf4f745 100644 --- a/requirements/app/base.txt +++ b/requirements/app/base.txt @@ -12,4 +12,3 @@ beautifulsoup4>=4.8.0, <4.11.2 inquirer>=2.10.0 psutil<5.9.4 click<=8.1.3 -lightning_api_access>=0.0.3 diff --git a/requirements/app/components.txt b/requirements/app/components.txt new file mode 100644 index 00000000000000..38180a480a59b3 --- /dev/null +++ b/requirements/app/components.txt @@ -0,0 +1,2 @@ +# deps required by components in the lightning app repository (src/lightning_app/components) +lightning_api_access>=0.0.3 diff --git a/requirements/app/devel.txt b/requirements/app/devel.txt index 43a7605e8b65af..deee89e23758d7 100644 --- a/requirements/app/devel.txt +++ b/requirements/app/devel.txt @@ -10,4 +10,7 @@ # extended list of dependencies for UI -r ./ui.txt +# extended list of dependencies for UI +-r ./components.txt + -r ./examples.txt diff --git a/src/lightning_app/components/serve/python_server.py b/src/lightning_app/components/serve/python_server.py index 7f7a8eeea98f42..99d51ac1cf4fce 100644 --- a/src/lightning_app/components/serve/python_server.py +++ b/src/lightning_app/components/serve/python_server.py @@ -6,6 +6,7 @@ import uvicorn from fastapi import FastAPI +from lightning_utilities.core.imports import module_available from pydantic import BaseModel from starlette.staticfiles import StaticFiles @@ -17,9 +18,14 @@ logger = Logger(__name__) +__doctest_skip__ = [] +# Skip doctests if requirements aren't available +if not module_available("lightning_api_access"): + __doctest_skip__ += ["PythonServer", "PythonServer.*"] + # Skip doctests if requirements aren't available if not _is_torch_available(): - __doctest_skip__ = ["PythonServer", "PythonServer.*"] + __doctest_skip__ += ["PythonServer", "PythonServer.*"] class _PyTorchSpawnRunExecutor(WorkRunExecutor): @@ -90,7 +96,7 @@ def _get_sample_data() -> Dict[Any, Any]: class PythonServer(LightningWork, abc.ABC): - @requires("torch") + @requires(["torch", "lightning_api_access"]) def __init__( # type: ignore self, host: str = "127.0.0.1", From 5144160c846b636046accd4e6e5a25f204077360 Mon Sep 17 00:00:00 2001 From: Sherin Thomas Date: Thu, 1 Dec 2022 19:09:19 +0530 Subject: [PATCH 3/7] [App] Fixing Sigterm Handler causing thread lock which caused KeyboardInterrupt to hang (#15881) * terminating only once * changelog --- src/lightning_app/CHANGELOG.md | 2 ++ src/lightning_app/runners/multiprocess.py | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 59acf63a1fee0c..b7e01e77224cb6 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -42,6 +42,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed the `enable_spawn` method of the `WorkRunExecutor` ([#15812](https://github.com/Lightning-AI/lightning/pull/15812) +- Fixed Sigterm Handler causing thread lock which caused KeyboardInterrupt to hang ([#15881](https://github.com/Lightning-AI/lightning/pull/15881)) + ## [1.8.3] - 2022-11-22 diff --git a/src/lightning_app/runners/multiprocess.py b/src/lightning_app/runners/multiprocess.py index 1f7e0b906ba4bf..343996cbdd7326 100644 --- a/src/lightning_app/runners/multiprocess.py +++ b/src/lightning_app/runners/multiprocess.py @@ -27,6 +27,7 @@ class MultiProcessRuntime(Runtime): """ backend: Union[str, Backend] = "multiprocessing" + _has_triggered_termination: bool = False def dispatch(self, *args: Any, on_before_run: Optional[Callable] = None, **kwargs: Any): """Method to dispatch and run the LightningApp.""" @@ -111,9 +112,11 @@ def dispatch(self, *args: Any, on_before_run: Optional[Callable] = None, **kwarg self.app._run() except KeyboardInterrupt: self.terminate() + self._has_triggered_termination = True raise finally: - self.terminate() + if not self._has_triggered_termination: + self.terminate() def terminate(self): if APP_SERVER_IN_CLOUD: From 36b953b64349b83e7165ed3da2792a5c8e9bc4b5 Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Thu, 1 Dec 2022 21:12:15 +0100 Subject: [PATCH 4/7] CI: signal lai build (#15871) --- .github/actions/pkg-publish/action.yml | 24 +++-- .github/workflows/legacy-checkpoints.yml | 18 +++- .github/workflows/release-pypi.yml | 108 ++++++++++++++++++++--- 3 files changed, 128 insertions(+), 22 deletions(-) diff --git a/.github/actions/pkg-publish/action.yml b/.github/actions/pkg-publish/action.yml index c9b202273f68c6..b9362784ae6c57 100644 --- a/.github/actions/pkg-publish/action.yml +++ b/.github/actions/pkg-publish/action.yml @@ -5,9 +5,14 @@ inputs: pkg-pattern: description: what file pattern is searched in folder, so for example `*_app*` required: true + pypi-test-token: + description: login token for PyPI + default: '' + required: false pypi-token: description: login token for PyPI - required: true + default: '' + required: false runs: using: "composite" @@ -20,16 +25,19 @@ runs: shell: bash # We do this, since failures on test.pypi aren't that bad - #- name: Publish to Test PyPI - # uses: pypa/gh-action-pypi-publish@v1.5.1 - # with: - # user: __token__ - # password: ${{ secrets.test_pypi_token_lai }} - # repository_url: https://test.pypi.org/legacy/ - # verbose: true + - name: Publish to Test PyPI + uses: pypa/gh-action-pypi-publish@v1.5.1 + if: inputs.pypi-test-token != '' + with: + user: __token__ + password: ${{ secrets.test_pypi_token_lai }} + repository_url: https://test.pypi.org/legacy/ + packages_dir: pypi/ + verbose: true - name: Publish distribution 📦 to PyPI uses: pypa/gh-action-pypi-publish@v1.5.1 + if: inputs.pypi-token != '' with: user: __token__ password: ${{ inputs.pypi-token }} diff --git a/.github/workflows/legacy-checkpoints.yml b/.github/workflows/legacy-checkpoints.yml index 9fa98cc5b10490..5e08455d22c8ff 100644 --- a/.github/workflows/legacy-checkpoints.yml +++ b/.github/workflows/legacy-checkpoints.yml @@ -41,11 +41,15 @@ on: AWS_SECRET_KEY_ID: required: true +defaults: + run: + shell: bash + jobs: create-legacy-ckpts: runs-on: ubuntu-20.04 outputs: - pl-version: ${{ steps.decide-pl-version.outputs.pl-version }} + pl-version: ${{ steps.decide-version.outputs.pl-version }} defaults: run: working-directory: tests/legacy @@ -69,16 +73,24 @@ jobs: env: PACKAGE_NAME: pytorch run: | - pip install -e . -f https://download.pytorch.org/whl/cpu/torch_stable.html + pip install . -f https://download.pytorch.org/whl/cpu/torch_stable.html pip list if: inputs.pl_version == '' + - name: Install PL version + run: | + pip install "pytorch-lightning==${{ inputs.pl_version }}" \ + -f https://download.pytorch.org/whl/cpu/torch_stable.html \ + --extra-index-url https://test.pypi.org/simple/ + pip list + if: inputs.pl_version != '' + - name: Pull legacy checkpoints working-directory: ./ run: bash .actions/pull_legacy_checkpoints.sh - name: Decide PL version to create a PR with - id: decide-pl-version + id: decide-version run: | python -c "import pytorch_lightning as pl; print(f'pl-version={pl.__version__}')" >> $GITHUB_OUTPUT || echo pl-version='' >> $GITHUB_OUTPUT diff --git a/.github/workflows/release-pypi.yml b/.github/workflows/release-pypi.yml index a3d7b85ab9935e..ba7f18341ac013 100644 --- a/.github/workflows/release-pypi.yml +++ b/.github/workflows/release-pypi.yml @@ -7,6 +7,10 @@ on: release: types: [published] +defaults: + run: + shell: bash + jobs: init: runs-on: ubuntu-20.04 @@ -24,8 +28,7 @@ jobs: runs-on: ubuntu-20.04 strategy: fail-fast: true - # run sequential - max-parallel: 1 + max-parallel: 1 # run sequential to prevent download/upload collisions matrix: pkg-name: ["lightning", "app", "lite", "pytorch"] steps: @@ -70,7 +73,69 @@ jobs: files: 'dist/*' repo-token: ${{ secrets.GITHUB_TOKEN }} - publish-packages: + release-version: + runs-on: ubuntu-20.04 + outputs: + tag: ${{ steps.lai-package.outputs.version }} + steps: + - uses: actions/checkout@v3 + - name: install Package + env: + PACKAGE_NAME: "lightning" + run: pip install . -f https://download.pytorch.org/whl/cpu/torch_stable.html + - name: package Version + id: lai-package + run: python -c "import lightning as L; print(f'version={L.__version__}')" >> $GITHUB_OUTPUT + + signaling: + runs-on: ubuntu-20.04 + needs: [release-version, pre-publish-packages] + env: + TAG: ${{ needs.release-version.outputs.tag }} + steps: + - uses: actions/setup-python@v4 + with: + python-version: 3.8 + - uses: actions/checkout@v3 + with: + repository: gridai/base-images + token: ${{ secrets.PAT_GHOST }} + ref: main + - uses: fregante/setup-git-token@v1 + with: + token: ${{ secrets.PAT_GHOST }} + name: PL Ghost + email: pl-github@grid.ai + - name: Update lightning version + run: | + import json, os + with open("versions.json") as fo: + vers = json.load(fo) + vers["lightning_version"] = os.getenv('TAG') + with open("versions.json", "w") as fw: + json.dump(vers, fw) + shell: python + - name: GIT Commit + run: | + git add versions.json + git commit -m "bumping lightning version -> ${TAG}" + cat versions.json + - name: GIT Push + run: | + git status + git push + + waiting: + # TODO: replace with back signal from build images/ loop checking for a specific branch? + runs-on: ubuntu-20.04 + needs: signaling + steps: + - name: Delay releasing + uses: juliangruber/sleep-action@v1 + with: + time: 30m + + pre-publish-packages: runs-on: ubuntu-20.04 needs: build-packages if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' @@ -82,38 +147,59 @@ jobs: path: dist - run: ls -lh dist/ - run: mkdir pypi/ - - - name: Delay releasing - uses: juliangruber/sleep-action@v1 + - uses: ./.github/actions/pkg-publish with: - time: 10m + pkg-pattern: "*app*" + pypi-test-token: ${{ secrets.PYPI_TEST_TOKEN_APP }} + - uses: ./.github/actions/pkg-publish + with: + pkg-pattern: "*lite*" + pypi-test-token: ${{ secrets.PYPI_TEST_TOKEN_LITE }} + - uses: ./.github/actions/pkg-publish + with: + pkg-pattern: "*pytorch*" + pypi-test-token: ${{ secrets.PYPI_TEST_TOKEN_PYTORCH }} + - uses: ./.github/actions/pkg-publish + with: + pkg-pattern: "*" + pypi-test-token: ${{ secrets.PYPI_TEST_TOKEN_LAI }} + publish-packages: + runs-on: ubuntu-20.04 + needs: waiting + if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' + steps: + - uses: actions/checkout@v3 + - uses: actions/download-artifact@v3 + with: + name: dist-packages-${{ github.sha }} + path: dist + - run: ls -lh dist/ + - run: mkdir pypi/ - uses: ./.github/actions/pkg-publish with: pkg-pattern: "*app*" pypi-token: ${{ secrets.PYPI_TOKEN_APP }} - - uses: ./.github/actions/pkg-publish with: pkg-pattern: "*lite*" pypi-token: ${{ secrets.PYPI_TOKEN_LITE }} - - uses: ./.github/actions/pkg-publish with: pkg-pattern: "*pytorch*" pypi-token: ${{ secrets.PYPI_TOKEN_PYTORCH }} - - uses: ./.github/actions/pkg-publish with: pkg-pattern: "*" pypi-token: ${{ secrets.PYPI_TOKEN_LAI }} legacy-checkpoints: - needs: publish-packages + needs: [release-version, pre-publish-packages] uses: ./.github/workflows/legacy-checkpoints.yml with: push_to_s3: true create_pr: true + pl_version: ${{ needs.release-version.outputs.tag }} secrets: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_KEY_ID: ${{ secrets.AWS_SECRET_KEY_ID }} From 1bc3a97e95c9013a8ab86076437558a3428354d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 1 Dec 2022 12:41:22 -0800 Subject: [PATCH 5/7] Remove checks for torch greater than 1.10 (#15846) --- .../advanced/model_parallel.rst | 3 --- .../common/precision_intermediate.rst | 5 +---- docs/source-pytorch/conf.py | 1 - .../plugins/collectives/torch_collective.py | 7 ++----- .../plugins/precision/native_amp.py | 20 +++++------------- src/lightning_lite/utilities/imports.py | 2 -- .../callbacks/quantization.py | 20 +++++------------- src/pytorch_lightning/core/module.py | 11 ++-------- .../plugins/precision/native_amp.py | 21 +++++-------------- src/pytorch_lightning/strategies/ddp.py | 8 +++---- src/pytorch_lightning/utilities/__init__.py | 1 - src/pytorch_lightning/utilities/imports.py | 1 - tests/tests_lite/helpers/runif.py | 3 +-- .../plugins/precision/test_native_amp.py | 8 ------- tests/tests_lite/test_connector.py | 10 --------- tests/tests_pytorch/helpers/runif.py | 3 +-- .../tests_pytorch/plugins/test_amp_plugins.py | 11 ---------- .../strategies/test_ddp_strategy.py | 4 +--- .../test_ddp_strategy_with_comm_hook.py | 5 +---- 19 files changed, 28 insertions(+), 116 deletions(-) diff --git a/docs/source-pytorch/advanced/model_parallel.rst b/docs/source-pytorch/advanced/model_parallel.rst index a9922f4274154e..ab96b5339b0188 100644 --- a/docs/source-pytorch/advanced/model_parallel.rst +++ b/docs/source-pytorch/advanced/model_parallel.rst @@ -1108,9 +1108,6 @@ Combine hooks for accumulated benefit: When using Post-localSGD, you must also pass ``model_averaging_period`` to allow for model parameter averaging: -.. note:: - Post-localSGD support requires PyTorch>=1.10.0 - .. code-block:: python from pytorch_lightning import Trainer diff --git a/docs/source-pytorch/common/precision_intermediate.rst b/docs/source-pytorch/common/precision_intermediate.rst index 1b08c88b6dc26f..0f149e93db4d83 100644 --- a/docs/source-pytorch/common/precision_intermediate.rst +++ b/docs/source-pytorch/common/precision_intermediate.rst @@ -115,8 +115,6 @@ BFloat16 Mixed Precision .. warning:: - BFloat16 requires PyTorch 1.10 or later and is only supported with PyTorch Native AMP. - BFloat16 is also experimental and may not provide significant speedups or memory improvements, offering better numerical stability. Do note for GPUs, the most significant benefits require `Ampere `__ based GPUs, such as A100s or 3090s. @@ -126,14 +124,13 @@ BFloat16 Mixed precision is similar to FP16 mixed precision, however, it maintai Under the hood, we use `torch.autocast `__ with the dtype set to ``bfloat16``, with no gradient scaling. .. testcode:: - :skipif: not _TORCH_GREATER_EQUAL_1_10 or not torch.cuda.is_available() + :skipif: not torch.cuda.is_available() Trainer(accelerator="gpu", devices=1, precision="bf16") It is also possible to use BFloat16 mixed precision on the CPU, relying on MKLDNN under the hood. .. testcode:: - :skipif: not _TORCH_GREATER_EQUAL_1_10 Trainer(precision="bf16") diff --git a/docs/source-pytorch/conf.py b/docs/source-pytorch/conf.py index 3a26c951486a5c..5bb3eb4c1115fc 100644 --- a/docs/source-pytorch/conf.py +++ b/docs/source-pytorch/conf.py @@ -400,7 +400,6 @@ def package_list_from_file(file): from pytorch_lightning.utilities import ( _APEX_AVAILABLE, _TORCHVISION_AVAILABLE, - _TORCH_GREATER_EQUAL_1_10, ) from pytorch_lightning.loggers.neptune import _NEPTUNE_AVAILABLE from pytorch_lightning.loggers.comet import _COMET_AVAILABLE diff --git a/src/lightning_lite/plugins/collectives/torch_collective.py b/src/lightning_lite/plugins/collectives/torch_collective.py index 4ec48cfdbc56b2..fc4282a28245b5 100644 --- a/src/lightning_lite/plugins/collectives/torch_collective.py +++ b/src/lightning_lite/plugins/collectives/torch_collective.py @@ -7,7 +7,7 @@ from typing_extensions import Self from lightning_lite.plugins.collectives.collective import Collective -from lightning_lite.utilities.imports import _TORCH_GREATER_EQUAL_1_10, _TORCH_GREATER_EQUAL_1_13 +from lightning_lite.utilities.imports import _TORCH_GREATER_EQUAL_1_13 from lightning_lite.utilities.types import CollectibleGroup, RedOpType, ReduceOp if dist.is_available(): @@ -86,10 +86,7 @@ def all_gather_object(self, object_list: List[Any], obj: Any) -> List[Any]: def broadcast_object_list( self, object_list: List[Any], src: int, device: Optional[torch.device] = None ) -> List[Any]: - kwargs = {} - if _TORCH_GREATER_EQUAL_1_10: - kwargs["device"] = device - dist.broadcast_object_list(object_list, src, group=self.group, **kwargs) + dist.broadcast_object_list(object_list, src, group=self.group, device=device) return object_list def gather_object(self, obj: Any, object_gather_list: List[Any], dst: int = 0) -> List[Any]: diff --git a/src/lightning_lite/plugins/precision/native_amp.py b/src/lightning_lite/plugins/precision/native_amp.py index b09ac5647f89a6..5f9b477171c21d 100644 --- a/src/lightning_lite/plugins/precision/native_amp.py +++ b/src/lightning_lite/plugins/precision/native_amp.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from contextlib import contextmanager -from typing import Any, Dict, Generator, Optional, Union +from typing import Any, Dict, Generator, Optional import torch from torch import Tensor @@ -23,14 +23,8 @@ from lightning_lite.accelerators.cuda import _patch_cuda_is_available from lightning_lite.plugins.precision.precision import Precision from lightning_lite.plugins.precision.utils import _convert_fp_tensor -from lightning_lite.utilities.imports import _TORCH_GREATER_EQUAL_1_10 from lightning_lite.utilities.types import Optimizable -if _TORCH_GREATER_EQUAL_1_10: - from torch import autocast as new_autocast -else: - from torch.cuda.amp import autocast as old_autocast - class NativeMixedPrecision(Precision): """Plugin for Native Mixed Precision (AMP) training with ``torch.autocast``. @@ -45,8 +39,6 @@ def __init__( self, precision: Literal[16, "bf16"], device: str, scaler: Optional[torch.cuda.amp.GradScaler] = None ) -> None: super().__init__() - if precision == "bf16" and not _TORCH_GREATER_EQUAL_1_10: - raise ImportError("To use bfloat16 with native amp you must install torch greater or equal to 1.10.") if scaler is None and precision == 16: with _patch_cuda_is_available(): # if possible, we defer CUDA initialization to support strategies that will attempt forks @@ -96,9 +88,7 @@ def load_state_dict(self, state_dict: Dict[str, Any]) -> None: if self.scaler is not None: self.scaler.load_state_dict(state_dict) - def _autocast_context_manager(self) -> Union["old_autocast", "new_autocast"]: - if _TORCH_GREATER_EQUAL_1_10: - # the dtype could be automatically inferred but we need to manually set it due to a bug upstream - # https://github.com/pytorch/pytorch/issues/67233 - return new_autocast(self.device, dtype=torch.bfloat16 if self.precision == "bf16" else torch.half) - return old_autocast() + def _autocast_context_manager(self) -> torch.autocast: + # the dtype could be automatically inferred but we need to manually set it due to a bug upstream + # https://github.com/pytorch/pytorch/issues/67233 + return torch.autocast(self.device, dtype=torch.bfloat16 if self.precision == "bf16" else torch.half) diff --git a/src/lightning_lite/utilities/imports.py b/src/lightning_lite/utilities/imports.py index 83f3b76b1e9a43..d4cfe19ed99bb9 100644 --- a/src/lightning_lite/utilities/imports.py +++ b/src/lightning_lite/utilities/imports.py @@ -27,8 +27,6 @@ _PYTHON_GREATER_EQUAL_3_8_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 8) _PYTHON_GREATER_EQUAL_3_10_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 10) -_TORCH_GREATER_EQUAL_1_10 = compare_version("torch", operator.ge, "1.10.0") -_TORCH_LESSER_EQUAL_1_10_2 = compare_version("torch", operator.le, "1.10.2") _TORCH_GREATER_EQUAL_1_11 = compare_version("torch", operator.ge, "1.11.0") _TORCH_GREATER_EQUAL_1_12 = compare_version("torch", operator.ge, "1.12.0") _TORCH_GREATER_EQUAL_1_13 = compare_version("torch", operator.ge, "1.13.0") diff --git a/src/pytorch_lightning/callbacks/quantization.py b/src/pytorch_lightning/callbacks/quantization.py index d89bed03941052..18d71f4153cfba 100644 --- a/src/pytorch_lightning/callbacks/quantization.py +++ b/src/pytorch_lightning/callbacks/quantization.py @@ -22,18 +22,14 @@ import torch from torch import Tensor +from torch.ao.quantization.qconfig import QConfig from torch.quantization import FakeQuantizeBase import pytorch_lightning as pl from pytorch_lightning.callbacks.callback import Callback -from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_10, _TORCH_GREATER_EQUAL_1_11, _TORCH_GREATER_EQUAL_1_12 +from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_11, _TORCH_GREATER_EQUAL_1_12 from pytorch_lightning.utilities.exceptions import MisconfigurationException -if _TORCH_GREATER_EQUAL_1_10: - from torch.ao.quantization.qconfig import QConfig -else: - from torch.quantization import QConfig - if _TORCH_GREATER_EQUAL_1_11: from torch.ao.quantization import fuse_modules_qat as fuse_modules else: @@ -252,15 +248,9 @@ def _prepare_model(self, model: "pl.LightningModule") -> None: if self._observer_type == "histogram": model.qconfig = torch.quantization.get_default_qconfig(self._qconfig) elif self._observer_type == "average": - extra_kwargs: Dict[str, Optional[int]] = {} - if _TORCH_GREATER_EQUAL_1_12: - extra_kwargs["version"] = 0 - # version=None corresponds to using FakeQuantize rather than - # FusedMovingAvgObsFakeQuantize which was introduced in PT1.10 - # details in https://github.com/pytorch/pytorch/issues/64564 - elif _TORCH_GREATER_EQUAL_1_10: - extra_kwargs["version"] = None - model.qconfig = torch.quantization.get_default_qat_qconfig(self._qconfig, **extra_kwargs) + model.qconfig = torch.quantization.get_default_qat_qconfig( + self._qconfig, version=0 if _TORCH_GREATER_EQUAL_1_12 else None + ) elif isinstance(self._qconfig, QConfig): model.qconfig = self._qconfig # type: ignore [assignment] diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py index f180439da74c6a..53043ad3090259 100644 --- a/src/pytorch_lightning/core/module.py +++ b/src/pytorch_lightning/core/module.py @@ -45,7 +45,7 @@ from pytorch_lightning.core.saving import ModelIO from pytorch_lightning.loggers import Logger from pytorch_lightning.trainer.connectors.logger_connector.fx_validator import _FxValidator -from pytorch_lightning.utilities import _IS_WINDOWS, _TORCH_GREATER_EQUAL_1_10, GradClipAlgorithmType +from pytorch_lightning.utilities import _IS_WINDOWS, GradClipAlgorithmType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_11, _TORCH_GREATER_EQUAL_1_13 from pytorch_lightning.utilities.rank_zero import rank_zero_debug, rank_zero_warn @@ -1824,13 +1824,6 @@ def to_onnx(self, file_path: Union[str, Path], input_sample: Optional[Any] = Non input_sample = self._on_before_batch_transfer(input_sample) input_sample = self._apply_batch_transfer_handler(input_sample) - if not _TORCH_GREATER_EQUAL_1_10 and "example_outputs" not in kwargs: - self.eval() - if isinstance(input_sample, tuple): - kwargs["example_outputs"] = self(*input_sample) - else: - kwargs["example_outputs"] = self(input_sample) - torch.onnx.export(self, input_sample, file_path, **kwargs) self.train(mode) @@ -1938,7 +1931,7 @@ def _register_sharded_tensor_state_dict_hooks_if_available(self) -> None: These hooks ensure that ShardedTensors are included when saving, and are loaded the LightningModule correctly. """ - if not _TORCH_GREATER_EQUAL_1_10 or _IS_WINDOWS or not torch.distributed.is_available(): + if _IS_WINDOWS or not torch.distributed.is_available(): rank_zero_debug("Could not register sharded tensor state dict hooks") return diff --git a/src/pytorch_lightning/plugins/precision/native_amp.py b/src/pytorch_lightning/plugins/precision/native_amp.py index fe2c9e8a134a05..552562dedc8de0 100644 --- a/src/pytorch_lightning/plugins/precision/native_amp.py +++ b/src/pytorch_lightning/plugins/precision/native_amp.py @@ -22,14 +22,9 @@ from lightning_lite.accelerators.cuda import _patch_cuda_is_available from lightning_lite.utilities.types import Optimizable from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin -from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_10, AMPType, GradClipAlgorithmType +from pytorch_lightning.utilities import AMPType, GradClipAlgorithmType from pytorch_lightning.utilities.exceptions import MisconfigurationException -if _TORCH_GREATER_EQUAL_1_10: - from torch import autocast as new_autocast -else: - from torch.cuda.amp import autocast as old_autocast - class NativeMixedPrecisionPlugin(PrecisionPlugin): """Plugin for Native Mixed Precision (AMP) training with ``torch.autocast``. @@ -46,10 +41,6 @@ def __init__( self, precision: Union[str, int], device: str, scaler: Optional[torch.cuda.amp.GradScaler] = None ) -> None: super().__init__() - if precision == "bf16" and not _TORCH_GREATER_EQUAL_1_10: - raise MisconfigurationException( - "To use bfloat16 with native amp you must install torch greater or equal to 1.10." - ) if scaler is None and precision == 16: with _patch_cuda_is_available(): # if possible, we defer CUDA initialization to support strategies that will attempt forks @@ -113,12 +104,10 @@ def clip_gradients( ) super().clip_gradients(optimizer=optimizer, clip_val=clip_val, gradient_clip_algorithm=gradient_clip_algorithm) - def autocast_context_manager(self) -> Union["old_autocast", "new_autocast"]: - if _TORCH_GREATER_EQUAL_1_10: - # the dtype could be automatically inferred but we need to manually set it due to a bug upstream - # https://github.com/pytorch/pytorch/issues/67233 - return new_autocast(self.device, dtype=torch.bfloat16 if self.precision == "bf16" else torch.half) - return old_autocast() + def autocast_context_manager(self) -> torch.autocast: + # the dtype could be automatically inferred but we need to manually set it due to a bug upstream + # https://github.com/pytorch/pytorch/issues/67233 + return torch.autocast(self.device, dtype=torch.bfloat16 if self.precision == "bf16" else torch.half) @contextmanager def forward_context(self) -> Generator[None, None, None]: diff --git a/src/pytorch_lightning/strategies/ddp.py b/src/pytorch_lightning/strategies/ddp.py index 29a089a577a3f1..8f377c40f63df9 100644 --- a/src/pytorch_lightning/strategies/ddp.py +++ b/src/pytorch_lightning/strategies/ddp.py @@ -53,7 +53,7 @@ from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.distributed import register_ddp_comm_hook from pytorch_lightning.utilities.exceptions import DeadlockDetectedException -from pytorch_lightning.utilities.imports import _IS_WINDOWS, _TORCH_GREATER_EQUAL_1_10, _TORCH_GREATER_EQUAL_1_11 +from pytorch_lightning.utilities.imports import _IS_WINDOWS, _TORCH_GREATER_EQUAL_1_11 from pytorch_lightning.utilities.rank_zero import rank_zero_info, rank_zero_only, rank_zero_warn from pytorch_lightning.utilities.types import PredictStep, STEP_OUTPUT, TestStep, ValidationStep @@ -61,7 +61,7 @@ from fairscale.optim import OSS else: OSS = object -if _TORCH_GREATER_EQUAL_1_10 and torch.distributed.is_available(): +if torch.distributed.is_available(): from torch.distributed.algorithms.model_averaging.averagers import ModelAverager log = logging.getLogger(__name__) @@ -181,7 +181,7 @@ def setup(self, trainer: "pl.Trainer") -> None: self.setup_optimizers(trainer) _optimizers_to_device(self.optimizers, self.root_device) - if _TORCH_GREATER_EQUAL_1_10 and trainer_fn == TrainerFn.FITTING: + if trainer_fn == TrainerFn.FITTING: import torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook as post_localSGD if isinstance(self._ddp_comm_state, post_localSGD.PostLocalSGDState): @@ -279,7 +279,7 @@ def optimizer_step( """ optimizer_output = super().optimizer_step(optimizer, opt_idx, closure, model, **kwargs) - if not _TORCH_GREATER_EQUAL_1_10 or self._model_averager is None: + if self._model_averager is None: return optimizer_output params = [param for group in optimizer.param_groups for param in group["params"] if param.grad is not None] diff --git a/src/pytorch_lightning/utilities/__init__.py b/src/pytorch_lightning/utilities/__init__.py index fc603066926169..8c026643d93b1a 100644 --- a/src/pytorch_lightning/utilities/__init__.py +++ b/src/pytorch_lightning/utilities/__init__.py @@ -29,7 +29,6 @@ _IS_WINDOWS, _OMEGACONF_AVAILABLE, _POPTORCH_AVAILABLE, - _TORCH_GREATER_EQUAL_1_10, _TORCH_GREATER_EQUAL_1_11, _TORCH_GREATER_EQUAL_1_12, _TORCH_QUANTIZE_AVAILABLE, diff --git a/src/pytorch_lightning/utilities/imports.py b/src/pytorch_lightning/utilities/imports.py index 803b335fa2f2df..f2efdfcb82fcf8 100644 --- a/src/pytorch_lightning/utilities/imports.py +++ b/src/pytorch_lightning/utilities/imports.py @@ -22,7 +22,6 @@ _IS_WINDOWS = platform.system() == "Windows" _PYTHON_GREATER_EQUAL_3_8_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 8) _PYTHON_GREATER_EQUAL_3_10_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 10) -_TORCH_GREATER_EQUAL_1_10 = compare_version("torch", operator.ge, "1.10.0") _TORCH_LESSER_EQUAL_1_10_2 = compare_version("torch", operator.le, "1.10.2") _TORCH_GREATER_EQUAL_1_11 = compare_version("torch", operator.ge, "1.11.0") _TORCH_GREATER_EQUAL_1_12 = compare_version("torch", operator.ge, "1.12.0") diff --git a/tests/tests_lite/helpers/runif.py b/tests/tests_lite/helpers/runif.py index 3572fb107979d6..f4da399c393583 100644 --- a/tests/tests_lite/helpers/runif.py +++ b/tests/tests_lite/helpers/runif.py @@ -25,7 +25,6 @@ from lightning_lite.accelerators.mps import MPSAccelerator from lightning_lite.strategies.deepspeed import _DEEPSPEED_AVAILABLE from lightning_lite.strategies.fairscale import _FAIRSCALE_AVAILABLE -from lightning_lite.utilities.imports import _TORCH_GREATER_EQUAL_1_10 class RunIf: @@ -97,7 +96,7 @@ def __new__( if bf16_cuda: try: - cond = not (torch.cuda.is_available() and _TORCH_GREATER_EQUAL_1_10 and torch.cuda.is_bf16_supported()) + cond = not (torch.cuda.is_available() and torch.cuda.is_bf16_supported()) except (AssertionError, RuntimeError) as e: # AssertionError: Torch not compiled with CUDA enabled # RuntimeError: Found no NVIDIA driver on your system. diff --git a/tests/tests_lite/plugins/precision/test_native_amp.py b/tests/tests_lite/plugins/precision/test_native_amp.py index 69f3f758a59a2b..5d431df93e83fc 100644 --- a/tests/tests_lite/plugins/precision/test_native_amp.py +++ b/tests/tests_lite/plugins/precision/test_native_amp.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from unittest import mock from unittest.mock import Mock import pytest @@ -25,7 +24,6 @@ def test_native_amp_precision_default_scaler(): assert isinstance(precision.scaler, torch.cuda.amp.GradScaler) -@mock.patch("lightning_lite.plugins.precision.native_amp._TORCH_GREATER_EQUAL_1_10", True) def test_native_amp_precision_scaler_with_bf16(): with pytest.raises(ValueError, match="`precision='bf16'` does not use a scaler"): NativeMixedPrecision(precision="bf16", device=Mock(), scaler=Mock()) @@ -34,12 +32,6 @@ def test_native_amp_precision_scaler_with_bf16(): assert precision.scaler is None -@mock.patch("lightning_lite.plugins.precision.native_amp._TORCH_GREATER_EQUAL_1_10", False) -def test_native_amp_precision_bf16_min_torch(): - with pytest.raises(ImportError, match="you must install torch greater or equal to 1.10"): - NativeMixedPrecision(precision="bf16", device=Mock()) - - def test_native_amp_precision_forward_context(): """Test to ensure that the context manager correctly is set to bfloat16 on CPU and CUDA.""" precision = NativeMixedPrecision(precision=16, device="cuda") diff --git a/tests/tests_lite/test_connector.py b/tests/tests_lite/test_connector.py index 072ecdfe99af3d..e6e7b61f841963 100644 --- a/tests/tests_lite/test_connector.py +++ b/tests/tests_lite/test_connector.py @@ -763,7 +763,6 @@ def test_ddp_fork_on_unsupported_platform(_, strategy): _Connector(strategy=strategy) -@mock.patch("lightning_lite.plugins.precision.native_amp._TORCH_GREATER_EQUAL_1_10", True) def test_precision_selection_16_on_cpu_warns(): with pytest.warns( UserWarning, match=r"precision=16\)` but native AMP is not supported on CPU. Using `precision='bf16" @@ -771,14 +770,6 @@ def test_precision_selection_16_on_cpu_warns(): _Connector(precision=16) -@mock.patch("lightning_lite.plugins.precision.native_amp._TORCH_GREATER_EQUAL_1_10", False) -def test_precision_selection_16_raises_torch_version(monkeypatch): - with pytest.raises(ImportError, match="must install torch greater or equal to 1.10"): - _Connector(accelerator="cpu", precision=16) - with pytest.raises(ImportError, match="must install torch greater or equal to 1.10"): - _Connector(accelerator="cpu", precision="bf16") - - class MyNativeAMP(NativeMixedPrecision): pass @@ -789,7 +780,6 @@ class MyNativeAMP(NativeMixedPrecision): "is_custom_plugin,plugin_cls", [(False, NativeMixedPrecision), (True, MyNativeAMP)], ) -@mock.patch("lightning_lite.plugins.precision.native_amp._TORCH_GREATER_EQUAL_1_10", True) def test_precision_selection_amp_ddp(strategy, devices, is_custom_plugin, plugin_cls): plugin = None if is_custom_plugin: diff --git a/tests/tests_pytorch/helpers/runif.py b/tests/tests_pytorch/helpers/runif.py index 5323461d52fdd8..ac3b45c0f8d55d 100644 --- a/tests/tests_pytorch/helpers/runif.py +++ b/tests/tests_pytorch/helpers/runif.py @@ -36,7 +36,6 @@ _IPU_AVAILABLE, _OMEGACONF_AVAILABLE, _PSUTIL_AVAILABLE, - _TORCH_GREATER_EQUAL_1_10, _TORCH_QUANTIZE_AVAILABLE, ) from tests_pytorch.helpers.datamodules import _SKLEARN_AVAILABLE @@ -162,7 +161,7 @@ def __new__( if bf16_cuda: try: - cond = not (torch.cuda.is_available() and _TORCH_GREATER_EQUAL_1_10 and torch.cuda.is_bf16_supported()) + cond = not (torch.cuda.is_available() and torch.cuda.is_bf16_supported()) except (AssertionError, RuntimeError) as e: # AssertionError: Torch not compiled with CUDA enabled # RuntimeError: Found no NVIDIA driver on your system. diff --git a/tests/tests_pytorch/plugins/test_amp_plugins.py b/tests/tests_pytorch/plugins/test_amp_plugins.py index acc7286d28966a..65a764a0931275 100644 --- a/tests/tests_pytorch/plugins/test_amp_plugins.py +++ b/tests/tests_pytorch/plugins/test_amp_plugins.py @@ -258,17 +258,6 @@ def test_precision_selection_raises(monkeypatch): ): Trainer(amp_backend="apex", precision=16) - import pytorch_lightning.plugins.precision.native_amp as amp - - monkeypatch.setattr(amp, "_TORCH_GREATER_EQUAL_1_10", False) - with pytest.warns( - UserWarning, match=r"precision=16\)` but native AMP is not supported on CPU. Using `precision='bf16" - ), pytest.raises(MisconfigurationException, match="must install torch greater or equal to 1.10"): - Trainer(precision=16) - - with pytest.raises(MisconfigurationException, match="must install torch greater or equal to 1.10"): - Trainer(precision="bf16") - with pytest.raises(MisconfigurationException, match=r"amp_type='apex', precision='bf16'\)` but it's not supported"): Trainer(amp_backend="apex", precision="bf16") diff --git a/tests/tests_pytorch/strategies/test_ddp_strategy.py b/tests/tests_pytorch/strategies/test_ddp_strategy.py index 58768b2b6ce1e0..ee6e67df29f096 100644 --- a/tests/tests_pytorch/strategies/test_ddp_strategy.py +++ b/tests/tests_pytorch/strategies/test_ddp_strategy.py @@ -17,6 +17,7 @@ import pytest import torch +from torch.distributed.optim import ZeroRedundancyOptimizer from torch.nn.parallel import DistributedDataParallel from lightning_lite.plugins.environments import ClusterEnvironment, LightningEnvironment @@ -25,13 +26,10 @@ from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.strategies import DDPStrategy from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_10 from tests_pytorch.helpers.runif import RunIf if _FAIRSCALE_AVAILABLE: from fairscale.optim import OSS -if _TORCH_GREATER_EQUAL_1_10: - from torch.distributed.optim import ZeroRedundancyOptimizer class BoringModelGPU(BoringModel): diff --git a/tests/tests_pytorch/strategies/test_ddp_strategy_with_comm_hook.py b/tests/tests_pytorch/strategies/test_ddp_strategy_with_comm_hook.py index f89a1d1a6ce8da..0e7e7c3bf36e78 100644 --- a/tests/tests_pytorch/strategies/test_ddp_strategy_with_comm_hook.py +++ b/tests/tests_pytorch/strategies/test_ddp_strategy_with_comm_hook.py @@ -19,16 +19,13 @@ from pytorch_lightning import Trainer from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.strategies import DDPSpawnStrategy, DDPStrategy -from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_10 from tests_pytorch.helpers.runif import RunIf if torch.distributed.is_available(): + import torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook as post_localSGD from torch.distributed.algorithms.ddp_comm_hooks import default_hooks as default from torch.distributed.algorithms.ddp_comm_hooks import powerSGD_hook as powerSGD - if _TORCH_GREATER_EQUAL_1_10: - import torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook as post_localSGD - class TestDDPStrategy(DDPStrategy): def __init__(self, expected_ddp_comm_hook_name, *args, **kwargs): From 993bd67f963651b5b02c42c676a343cc99577160 Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Thu, 1 Dec 2022 23:14:18 +0100 Subject: [PATCH 6/7] CI: prune dependency for benchmarks (#15879) * prune dependency for benchmarks * drop --- .azure/gpu-benchmark.yml | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/.azure/gpu-benchmark.yml b/.azure/gpu-benchmark.yml index bcd925e1d267ee..26b4a544e02e93 100644 --- a/.azure/gpu-benchmark.yml +++ b/.azure/gpu-benchmark.yml @@ -68,14 +68,7 @@ jobs: pip list displayName: 'Image info & NVIDIA' - - bash: | - python .actions/assistant.py requirements_prune_pkgs --packages [horovod,bagua,colossalai] --req_files [requirements/pytorch/strategies.txt] - - PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])") - python ./requirements/pytorch/adjust-versions.py requirements/pytorch/base.txt ${PYTORCH_VERSION} - displayName: 'Adjust dependencies' - - - bash: pip install -e .[dev,strategies,examples] --find-links ${TORCH_URL} + - bash: pip install -e .[dev] --find-links ${TORCH_URL} env: PACKAGE_NAME: "pytorch" FREEZE_REQUIREMENTS: "1" From fee52f931fef079f202e1ee1f29e83f7808e6086 Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Fri, 2 Dec 2022 07:50:51 +0100 Subject: [PATCH 7/7] unblock legacy checkpoints (#15798) * fixing legacy checkpoints * Apply suggestions from code review Co-authored-by: Akihiro Nitta --- tests/legacy/simple_classif_training.py | 4 ++-- .../checkpointing/test_legacy_checkpoints.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/legacy/simple_classif_training.py b/tests/legacy/simple_classif_training.py index ab7b1fab9f7c7b..960eea34eedf01 100644 --- a/tests/legacy/simple_classif_training.py +++ b/tests/legacy/simple_classif_training.py @@ -42,8 +42,8 @@ def main_train(dir_path, max_epochs: int = 20): model = ClassificationModel() trainer.fit(model, datamodule=dm) res = trainer.test(model, datamodule=dm) - assert res[0]["test_loss"] <= 0.7 - assert res[0]["test_acc"] >= 0.85 + assert res[0]["test_loss"] <= 0.85, str(res[0]["test_loss"]) + assert res[0]["test_acc"] >= 0.7, str(res[0]["test_acc"]) assert trainer.current_epoch < (max_epochs - 1) diff --git a/tests/tests_pytorch/checkpointing/test_legacy_checkpoints.py b/tests/tests_pytorch/checkpointing/test_legacy_checkpoints.py index 4a99accb069977..1100ac8fcde1ba 100644 --- a/tests/tests_pytorch/checkpointing/test_legacy_checkpoints.py +++ b/tests/tests_pytorch/checkpointing/test_legacy_checkpoints.py @@ -47,8 +47,8 @@ def test_load_legacy_checkpoints(tmpdir, pl_version: str): trainer = Trainer(default_root_dir=str(tmpdir)) dm = ClassifDataModule(num_features=24, length=6000, batch_size=128, n_clusters_per_class=2, n_informative=8) res = trainer.test(model, datamodule=dm) - assert res[0]["test_loss"] <= 0.7 - assert res[0]["test_acc"] >= 0.85 + assert res[0]["test_loss"] <= 0.85, str(res[0]["test_loss"]) + assert res[0]["test_acc"] >= 0.7, str(res[0]["test_acc"]) print(res) @@ -111,5 +111,5 @@ def test_resume_legacy_checkpoints(tmpdir, pl_version: str): torch.backends.cudnn.deterministic = True trainer.fit(model, datamodule=dm, ckpt_path=path_ckpt) res = trainer.test(model, datamodule=dm) - assert res[0]["test_loss"] <= 0.7 - assert res[0]["test_acc"] >= 0.85 + assert res[0]["test_loss"] <= 0.85, str(res[0]["test_loss"]) + assert res[0]["test_acc"] >= 0.7, str(res[0]["test_acc"])