diff --git a/CHANGELOG.md b/CHANGELOG.md
index eeace95653ee0..9d213b5a8449e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -90,9 +90,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Removed the deprecated `log_gpu_memory` argument from the `Trainer` constructor ([#12657](https://github.com/PyTorchLightning/pytorch-lightning/pull/12657))
 
-
 - Removed the deprecated automatic logging of GPU stats by the logger connector ([#12657](https://github.com/PyTorchLightning/pytorch-lightning/pull/12657))
 
+
+- Removed deprecated `GPUStatsMonitor` callback ([#12554](https://github.com/PyTorchLightning/pytorch-lightning/pull/12554))
+
+
 ### Fixed
 
 - Avoid calling `average_parameters` multiple times per optimizer step ([#12452](https://github.com/PyTorchLightning/pytorch-lightning/pull/12452))
diff --git a/docs/source/api_references.rst b/docs/source/api_references.rst
index 23a43990fed39..aa0ad662223b0 100644
--- a/docs/source/api_references.rst
+++ b/docs/source/api_references.rst
@@ -85,7 +85,6 @@ Callbacks API
     Callback
     DeviceStatsMonitor
     EarlyStopping
-    GPUStatsMonitor
     GradientAccumulationScheduler
     LambdaCallback
     LearningRateMonitor
diff --git a/docs/source/extensions/callbacks.rst b/docs/source/extensions/callbacks.rst
index 19b43f4f4d127..9f1765fe63b8f 100644
--- a/docs/source/extensions/callbacks.rst
+++ b/docs/source/extensions/callbacks.rst
@@ -92,7 +92,6 @@ Lightning has a few built-in callbacks.
     Callback
     DeviceStatsMonitor
     EarlyStopping
-    GPUStatsMonitor
     GradientAccumulationScheduler
     LambdaCallback
     LearningRateMonitor
diff --git a/pytorch_lightning/callbacks/__init__.py b/pytorch_lightning/callbacks/__init__.py
index f47bc115ece51..808ee0208400d 100644
--- a/pytorch_lightning/callbacks/__init__.py
+++ b/pytorch_lightning/callbacks/__init__.py
@@ -15,7 +15,6 @@
 from pytorch_lightning.callbacks.device_stats_monitor import DeviceStatsMonitor
 from pytorch_lightning.callbacks.early_stopping import EarlyStopping
 from pytorch_lightning.callbacks.finetuning import BackboneFinetuning, BaseFinetuning
-from pytorch_lightning.callbacks.gpu_stats_monitor import GPUStatsMonitor
 from pytorch_lightning.callbacks.gradient_accumulation_scheduler import GradientAccumulationScheduler
 from pytorch_lightning.callbacks.lambda_function import LambdaCallback
 from pytorch_lightning.callbacks.lr_monitor import LearningRateMonitor
@@ -36,7 +35,6 @@
     "Callback",
     "DeviceStatsMonitor",
     "EarlyStopping",
-    "GPUStatsMonitor",
     "XLAStatsMonitor",
     "GradientAccumulationScheduler",
     "LambdaCallback",
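With the class dropped from `pytorch_lightning.callbacks` and its `__all__`, any `from pytorch_lightning.callbacks import GPUStatsMonitor` now fails with an `ImportError`. A minimal compatibility sketch for downstream code that must work on both sides of this release (the try/except fallback is an assumption about how a project might bridge the gap, not part of this PR):

    # Hypothetical downstream shim: use the old callback where it still exists
    # (PL < 1.7), otherwise fall back to its documented replacement.
    try:
        from pytorch_lightning.callbacks import GPUStatsMonitor as StatsMonitor  # PL < 1.7
    except ImportError:
        from pytorch_lightning.callbacks import DeviceStatsMonitor as StatsMonitor  # PL >= 1.7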
diff --git a/pytorch_lightning/callbacks/gpu_stats_monitor.py b/pytorch_lightning/callbacks/gpu_stats_monitor.py
deleted file mode 100644
index 36b4006e37a2a..0000000000000
--- a/pytorch_lightning/callbacks/gpu_stats_monitor.py
+++ /dev/null
@@ -1,262 +0,0 @@
-# Copyright The PyTorch Lightning team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-GPU Stats Monitor
-=================
-
-Monitors and logs GPU stats during training.
-
-"""
-
-import os
-import shutil
-import subprocess
-import time
-from typing import Any, Dict, List, Optional, Tuple
-
-import torch
-
-import pytorch_lightning as pl
-from pytorch_lightning.callbacks.base import Callback
-from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from pytorch_lightning.utilities.parsing import AttributeDict
-from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_only
-from pytorch_lightning.utilities.types import STEP_OUTPUT
-
-
-class GPUStatsMonitor(Callback):
-    r"""
-    .. deprecated:: v1.5
-        The `GPUStatsMonitor` callback was deprecated in v1.5 and will be removed in v1.7.
-        Please use the `DeviceStatsMonitor` callback instead.
-
-    Automatically monitors and logs GPU stats during the training stage. ``GPUStatsMonitor``
-    is a callback; to use it, you need to assign a logger in the ``Trainer``.
-
-    Args:
-        memory_utilization: Set to ``True`` to monitor used, free and percentage of memory
-            utilization at the start and end of each step. Default: ``True``.
-        gpu_utilization: Set to ``True`` to monitor percentage of GPU utilization
-            at the start and end of each step. Default: ``True``.
-        intra_step_time: Set to ``True`` to monitor the time of each step. Default: ``False``.
-        inter_step_time: Set to ``True`` to monitor the time between the end of one step
-            and the start of the next step. Default: ``False``.
-        fan_speed: Set to ``True`` to monitor percentage of fan speed. Default: ``False``.
-        temperature: Set to ``True`` to monitor the memory and GPU temperature in degrees Celsius.
-            Default: ``False``.
-
-    Raises:
-        MisconfigurationException:
-            If NVIDIA driver is not installed, not running on GPUs, or ``Trainer`` has no logger.
-
-    Example::
-
-        >>> from pytorch_lightning import Trainer
-        >>> from pytorch_lightning.callbacks import GPUStatsMonitor
-        >>> gpu_stats = GPUStatsMonitor() # doctest: +SKIP
-        >>> trainer = Trainer(callbacks=[gpu_stats]) # doctest: +SKIP
-
-    GPU stats are mainly based on the `nvidia-smi --query-gpu` command. The description of the queries is as follows:
-
-    - **fan.speed** – The fan speed value is the percent of maximum speed that the device's fan is currently
-      intended to run at. It ranges from 0 to 100 %. Note: The reported speed is the intended fan speed.
-      If the fan is physically blocked and unable to spin, this output will not match the actual fan speed.
-      Many parts do not report fan speeds because they rely on cooling via fans in the surrounding enclosure.
-    - **memory.used** – Total memory allocated by active contexts.
-    - **memory.free** – Total free memory.
-    - **utilization.gpu** – Percent of time over the past sample period during which one or more kernels were
-      executing on the GPU. The sample period may be between 1 second and 1/6 second depending on the product.
-    - **utilization.memory** – Percent of time over the past sample period during which global (device) memory was
-      being read or written. The sample period may be between 1 second and 1/6 second depending on the product.
-    - **temperature.gpu** – Core GPU temperature, in degrees C.
-    - **temperature.memory** – HBM memory temperature, in degrees C.
-
-    """
-
-    def __init__(
-        self,
-        memory_utilization: bool = True,
-        gpu_utilization: bool = True,
-        intra_step_time: bool = False,
-        inter_step_time: bool = False,
-        fan_speed: bool = False,
-        temperature: bool = False,
-    ):
-        super().__init__()
-
-        rank_zero_deprecation(
-            "The `GPUStatsMonitor` callback was deprecated in v1.5 and will be removed in v1.7."
- " Please use the `DeviceStatsMonitor` callback instead." - ) - - if shutil.which("nvidia-smi") is None: - raise MisconfigurationException( - "Cannot use GPUStatsMonitor callback because NVIDIA driver is not installed." - ) - - self._log_stats = AttributeDict( - { - "memory_utilization": memory_utilization, - "gpu_utilization": gpu_utilization, - "intra_step_time": intra_step_time, - "inter_step_time": inter_step_time, - "fan_speed": fan_speed, - "temperature": temperature, - } - ) - - # The logical device IDs for selected devices - self._device_ids: List[int] = [] # will be assigned later in setup() - - # The unmasked real GPU IDs - self._gpu_ids: List[str] = [] # will be assigned later in setup() - - def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: Optional[str] = None) -> None: - if not trainer.loggers: - raise MisconfigurationException("Cannot use GPUStatsMonitor callback with Trainer that has no logger.") - - if trainer.strategy.root_device.type != "cuda": - raise MisconfigurationException( - "You are using GPUStatsMonitor but are not running on GPU." - f" The root device type is {trainer.strategy.root_device.type}." - ) - - # The logical device IDs for selected devices - self._device_ids = sorted(set(trainer.device_ids)) - - # The unmasked real GPU IDs - self._gpu_ids = self._get_gpu_ids(self._device_ids) - - def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: - self._snap_intra_step_time: Optional[float] = None - self._snap_inter_step_time: Optional[float] = None - - @rank_zero_only - def on_train_batch_start( - self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", batch: Any, batch_idx: int - ) -> None: - if self._log_stats.intra_step_time: - self._snap_intra_step_time = time.time() - - if not trainer._logger_connector.should_update_logs: - return - - gpu_stat_keys = self._get_gpu_stat_keys() - gpu_stats = self._get_gpu_stats([k for k, _ in gpu_stat_keys]) - logs = self._parse_gpu_stats(self._device_ids, gpu_stats, gpu_stat_keys) - - if self._log_stats.inter_step_time and self._snap_inter_step_time: - # First log at beginning of second step - logs["batch_time/inter_step (ms)"] = (time.time() - self._snap_inter_step_time) * 1000 - - for logger in trainer.loggers: - logger.log_metrics(logs, step=trainer.fit_loop.epoch_loop._batches_that_stepped) - - @rank_zero_only - def on_train_batch_end( - self, - trainer: "pl.Trainer", - pl_module: "pl.LightningModule", - outputs: STEP_OUTPUT, - batch: Any, - batch_idx: int, - ) -> None: - if self._log_stats.inter_step_time: - self._snap_inter_step_time = time.time() - - if not trainer._logger_connector.should_update_logs: - return - - gpu_stat_keys = self._get_gpu_stat_keys() + self._get_gpu_device_stat_keys() - gpu_stats = self._get_gpu_stats([k for k, _ in gpu_stat_keys]) - logs = self._parse_gpu_stats(self._device_ids, gpu_stats, gpu_stat_keys) - - if self._log_stats.intra_step_time and self._snap_intra_step_time: - logs["batch_time/intra_step (ms)"] = (time.time() - self._snap_intra_step_time) * 1000 - - for logger in trainer.loggers: - logger.log_metrics(logs, step=trainer.fit_loop.epoch_loop._batches_that_stepped) - - @staticmethod - def _get_gpu_ids(device_ids: List[int]) -> List[str]: - """Get the unmasked real GPU IDs.""" - # All devices if `CUDA_VISIBLE_DEVICES` unset - default = ",".join(str(i) for i in range(torch.cuda.device_count())) - cuda_visible_devices: List[str] = os.getenv("CUDA_VISIBLE_DEVICES", default=default).split(",") - return 
-
-    def _get_gpu_stats(self, queries: List[str]) -> List[List[float]]:
-        """Run `nvidia-smi` to get the GPU stats."""
-        if not queries:
-            return []
-
-        gpu_query = ",".join(queries)
-        format = "csv,nounits,noheader"
-        gpu_ids = ",".join(self._gpu_ids)
-        result = subprocess.run(
-            [
-                # it's ok to suppress the warning here since we ensure nvidia-smi exists during init
-                shutil.which("nvidia-smi"),  # type: ignore
-                f"--query-gpu={gpu_query}",
-                f"--format={format}",
-                f"--id={gpu_ids}",
-            ],
-            encoding="utf-8",
-            capture_output=True,
-            check=True,
-        )
-
-        def _to_float(x: str) -> float:
-            try:
-                return float(x)
-            except ValueError:
-                return 0.0
-
-        stats = [[_to_float(x) for x in s.split(", ")] for s in result.stdout.strip().split(os.linesep)]
-        return stats
-
-    @staticmethod
-    def _parse_gpu_stats(
-        device_ids: List[int], stats: List[List[float]], keys: List[Tuple[str, str]]
-    ) -> Dict[str, float]:
-        """Parse the gpu stats into a loggable dict."""
-        logs = {}
-        for i, device_id in enumerate(device_ids):
-            for j, (x, unit) in enumerate(keys):
-                logs[f"device_id: {device_id}/{x} ({unit})"] = stats[i][j]
-        return logs
-
-    def _get_gpu_stat_keys(self) -> List[Tuple[str, str]]:
-        """Get the GPU stats keys."""
-        stat_keys = []
-
-        if self._log_stats.gpu_utilization:
-            stat_keys.append(("utilization.gpu", "%"))
-
-        if self._log_stats.memory_utilization:
-            stat_keys.extend([("memory.used", "MB"), ("memory.free", "MB"), ("utilization.memory", "%")])
-
-        return stat_keys
-
-    def _get_gpu_device_stat_keys(self) -> List[Tuple[str, str]]:
-        """Get the device stats keys."""
-        stat_keys = []
-
-        if self._log_stats.fan_speed:
-            stat_keys.append(("fan.speed", "%"))
-
-        if self._log_stats.temperature:
-            stat_keys.extend([("temperature.gpu", "°C"), ("temperature.memory", "°C")])
-
-        return stat_keys
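The deprecation message in the deleted module points at `DeviceStatsMonitor`, which is wired up the same way. A minimal migration sketch (assuming a single-GPU machine and PL >= 1.6; `CSVLogger` is just an example, since like the removed callback, `DeviceStatsMonitor` only records stats when the `Trainer` has a logger):

    from pytorch_lightning import Trainer
    from pytorch_lightning.callbacks import DeviceStatsMonitor
    from pytorch_lightning.loggers import CSVLogger

    # DeviceStatsMonitor asks the active accelerator for its stats instead of
    # shelling out to `nvidia-smi`, so no NVIDIA driver check is needed here.
    trainer = Trainer(
        accelerator="gpu",
        devices=1,
        callbacks=[DeviceStatsMonitor()],
        logger=CSVLogger("logs"),
    )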
diff --git a/tests/callbacks/test_gpu_stats_monitor.py b/tests/callbacks/test_gpu_stats_monitor.py
deleted file mode 100644
index 1cc4d51d90eba..0000000000000
--- a/tests/callbacks/test_gpu_stats_monitor.py
+++ /dev/null
@@ -1,169 +0,0 @@
-# Copyright The PyTorch Lightning team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-from unittest import mock
-
-import numpy as np
-import pytest
-import torch
-
-from pytorch_lightning import Trainer
-from pytorch_lightning.callbacks import GPUStatsMonitor
-from pytorch_lightning.loggers import CSVLogger
-from pytorch_lightning.loggers.csv_logs import ExperimentWriter
-from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from tests.helpers import BoringModel
-from tests.helpers.runif import RunIf
-
-
-@RunIf(min_gpus=1)
-def test_gpu_stats_monitor(tmpdir):
-    """Test GPU stats are logged using a logger."""
-    model = BoringModel()
-    with pytest.deprecated_call(match="GPUStatsMonitor` callback was deprecated in v1.5"):
-        gpu_stats = GPUStatsMonitor(intra_step_time=True)
-    logger = CSVLogger(tmpdir)
-    log_every_n_steps = 2
-
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        max_epochs=2,
-        limit_train_batches=7,
-        log_every_n_steps=log_every_n_steps,
-        accelerator="gpu",
-        devices=1,
-        callbacks=[gpu_stats],
-        logger=logger,
-    )
-
-    trainer.fit(model)
-    assert trainer.state.finished, f"Training failed with {trainer.state}"
-
-    path_csv = os.path.join(logger.log_dir, ExperimentWriter.NAME_METRICS_FILE)
-    met_data = np.genfromtxt(path_csv, delimiter=",", names=True, deletechars="", replace_space=" ")
-
-    batch_time_data = met_data["batch_time/intra_step (ms)"]
-    batch_time_data = batch_time_data[~np.isnan(batch_time_data)]
-    assert batch_time_data.shape[0] == trainer.global_step // log_every_n_steps
-
-    fields = ["utilization.gpu", "memory.used", "memory.free", "utilization.memory"]
-
-    for f in fields:
-        assert any(f in h for h in met_data.dtype.names)
-
-
-@RunIf(min_gpus=1)
-def test_gpu_stats_monitor_no_queries(tmpdir):
-    """Test GPU logger doesn't fail if no "nvidia-smi" queries are to be performed."""
-    model = BoringModel()
-    with pytest.deprecated_call(match="GPUStatsMonitor` callback was deprecated in v1.5"):
-        gpu_stats = GPUStatsMonitor(
-            memory_utilization=False,
-            gpu_utilization=False,
-            intra_step_time=True,
-            inter_step_time=True,
-        )
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        max_epochs=1,
-        limit_train_batches=2,
-        limit_val_batches=0,
-        log_every_n_steps=1,
-        accelerator="gpu",
-        devices=1,
-        callbacks=[gpu_stats],
-    )
-    with mock.patch("pytorch_lightning.loggers.tensorboard.TensorBoardLogger.log_metrics") as log_metrics_mock:
-        trainer.fit(model)
-
-    assert log_metrics_mock.mock_calls[1:] == [
-        mock.call({"batch_time/intra_step (ms)": mock.ANY}, step=0),
-        mock.call({"batch_time/inter_step (ms)": mock.ANY}, step=1),
-        mock.call({"batch_time/intra_step (ms)": mock.ANY}, step=1),
-    ]
-
-
-@pytest.mark.skipif(torch.cuda.is_available(), reason="test requires CPU machine")
-def test_gpu_stats_monitor_cpu_machine(tmpdir):
-    """Test GPUStatsMonitor on CPU machine."""
-    with pytest.raises(MisconfigurationException, match="NVIDIA driver is not installed"), pytest.deprecated_call(
-        match="GPUStatsMonitor` callback was deprecated in v1.5"
-    ):
-        GPUStatsMonitor()
-
-
-@RunIf(min_gpus=1)
-def test_gpu_stats_monitor_no_logger(tmpdir):
-    """Test GPUStatsMonitor with no logger in Trainer."""
-    model = BoringModel()
-    with pytest.deprecated_call(match="GPUStatsMonitor` callback was deprecated in v1.5"):
-        gpu_stats = GPUStatsMonitor()
-
-    trainer = Trainer(
-        default_root_dir=tmpdir, callbacks=[gpu_stats], max_epochs=1, accelerator="gpu", devices=1, logger=False
-    )
-
-    with pytest.raises(MisconfigurationException, match="Trainer that has no logger."):
-        trainer.fit(model)
-
-
-@RunIf(min_gpus=1)
-def test_gpu_stats_monitor_no_gpu_warning(tmpdir):
-    """Test GPUStatsMonitor raises a warning when not training on GPU device."""
-    model = BoringModel()
-    with pytest.deprecated_call(match="GPUStatsMonitor` callback was deprecated in v1.5"):
-        gpu_stats = GPUStatsMonitor()
-
-    trainer = Trainer(default_root_dir=tmpdir, callbacks=[gpu_stats], max_steps=1, gpus=None)
-
-    with pytest.raises(MisconfigurationException, match="not running on GPU"):
-        trainer.fit(model)
-
-
-def test_gpu_stats_monitor_parse_gpu_stats():
-    logs = GPUStatsMonitor._parse_gpu_stats([1, 2], [[3, 4, 5], [6, 7]], [("gpu", "a"), ("memory", "b")])
-    expected = {
-        "device_id: 1/gpu (a)": 3,
-        "device_id: 1/memory (b)": 4,
-        "device_id: 2/gpu (a)": 6,
-        "device_id: 2/memory (b)": 7,
-    }
-    assert logs == expected
-
-
-@mock.patch.dict(os.environ, {}, clear=True)
-@mock.patch("torch.cuda.is_available", return_value=True)
-@mock.patch("torch.cuda.device_count", return_value=2)
-def test_gpu_stats_monitor_get_gpu_ids_cuda_visible_devices_unset(device_count_mock, is_available_mock):
-    gpu_ids = GPUStatsMonitor._get_gpu_ids([1, 0])
-    expected = ["1", "0"]
-    assert gpu_ids == expected
-
-
-@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "3,2,4"})
-@mock.patch("torch.cuda.is_available", return_value=True)
-@mock.patch("torch.cuda.device_count", return_value=3)
-def test_gpu_stats_monitor_get_gpu_ids_cuda_visible_devices_integers(device_count_mock, is_available_mock):
-    gpu_ids = GPUStatsMonitor._get_gpu_ids([1, 2])
-    expected = ["2", "4"]
-    assert gpu_ids == expected
-
-
-@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "GPU-01a23b4c,GPU-56d78e9f,GPU-02a46c8e"})
-@mock.patch("torch.cuda.is_available", return_value=True)
-@mock.patch("torch.cuda.device_count", return_value=3)
-def test_gpu_stats_monitor_get_gpu_ids_cuda_visible_devices_uuids(device_count_mock, is_available_mock):
-    gpu_ids = GPUStatsMonitor._get_gpu_ids([1, 2])
-    expected = ["GPU-56d78e9f", "GPU-02a46c8e"]
-    assert gpu_ids == expected
diff --git a/tests/deprecated_api/test_remove_1-7.py b/tests/deprecated_api/test_remove_1-7.py
index 62842317d3e57..6f7a23f58c142 100644
--- a/tests/deprecated_api/test_remove_1-7.py
+++ b/tests/deprecated_api/test_remove_1-7.py
@@ -22,7 +22,6 @@
 
 import pytorch_lightning
 from pytorch_lightning import Callback, LightningDataModule, Trainer
-from pytorch_lightning.callbacks.gpu_stats_monitor import GPUStatsMonitor
 from pytorch_lightning.callbacks.lr_monitor import LearningRateMonitor
 from pytorch_lightning.callbacks.progress import ProgressBar
 from pytorch_lightning.callbacks.xla_stats_monitor import XLAStatsMonitor
@@ -40,7 +39,6 @@
 from tests.deprecated_api import _soft_unimport_module
 from tests.helpers import BoringModel
 from tests.helpers.datamodules import MNISTDataModule
-from tests.helpers.runif import RunIf
 from tests.loggers.test_logger import CustomLogger
 from tests.plugins.environments.test_lsf_environment import _make_rankfile
 
@@ -329,12 +327,6 @@ def test_v1_7_0_deprecated_slurm_job_id():
     trainer.slurm_job_id
 
 
-@RunIf(min_gpus=1)
-def test_v1_7_0_deprecate_gpu_stats_monitor(tmpdir):
-    with pytest.deprecated_call(match="The `GPUStatsMonitor` callback was deprecated in v1.5"):
-        _ = GPUStatsMonitor()
-
-
 def test_v1_7_0_deprecate_xla_stats_monitor(monkeypatch):
     monkeypatch.setattr(pytorch_lightning.callbacks.xla_stats_monitor, "_TPU_AVAILABLE", True)
     with pytest.deprecated_call(match="The `XLAStatsMonitor` callback was deprecated in v1.5"):
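The three deleted `_get_gpu_ids` tests all exercise one rule: a logical CUDA device index `i` refers to the `i`-th entry of `CUDA_VISIBLE_DEVICES`, whatever form those entries take. A standalone sketch of that mapping (the helper name is hypothetical; the data mirrors the deleted tests):

    from typing import List

    def unmask_gpu_ids(device_ids: List[int], visible_devices: str) -> List[str]:
        """Map logical CUDA device indices onto the entries of CUDA_VISIBLE_DEVICES."""
        entries = [entry.strip() for entry in visible_devices.split(",")]
        return [entries[i] for i in device_ids]

    # Integer masking, as in test_gpu_stats_monitor_get_gpu_ids_cuda_visible_devices_integers:
    assert unmask_gpu_ids([1, 2], "3,2,4") == ["2", "4"]
    # UUID masking, as in test_gpu_stats_monitor_get_gpu_ids_cuda_visible_devices_uuids:
    assert unmask_gpu_ids([1, 2], "GPU-01a23b4c,GPU-56d78e9f,GPU-02a46c8e") == ["GPU-56d78e9f", "GPU-02a46c8e"]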