diff --git a/CHANGELOG.md b/CHANGELOG.md index 5f8899ac4f8b8..f71623d574fa3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -61,6 +61,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Make positional arguments required for classes passed into the `add_argparse_args` function. ([#12504](https://github.com/PyTorchLightning/pytorch-lightning/pull/12504)) +- Raise an error if there are insufficient training batches when using a float value of `limit_train_batches` ([#12885](https://github.com/PyTorchLightning/pytorch-lightning/pull/12885)) + + - ### Deprecated @@ -179,6 +182,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Enable mixed precision in `DDPFullyShardedStrategy` when `precision=16` ([#12965](https://github.com/PyTorchLightning/pytorch-lightning/pull/12965)) +- Fixed `TQDMProgressBar` reset and update to show correct time estimation ([#12889](https://github.com/PyTorchLightning/pytorch-lightning/pull/12889)) + + +- Fixed an issue causing zero-division error for empty dataloaders ([#12885](https://github.com/PyTorchLightning/pytorch-lightning/pull/12885)) + + ## [1.6.2] - 2022-04-27 ### Fixed @@ -189,9 +198,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed support for `ModelCheckpoint` monitors with dots ([#12783](https://github.com/PyTorchLightning/pytorch-lightning/pull/12783)) -- Fixed `TQDMProgressBar` reset and update to show correct time estimation ([#12889](https://github.com/PyTorchLightning/pytorch-lightning/pull/12889)) - - ## [1.6.1] - 2022-04-13 ### Changed diff --git a/pytorch_lightning/trainer/connectors/data_connector.py b/pytorch_lightning/trainer/connectors/data_connector.py index 233d8326727ab..85a37b332efed 100644 --- a/pytorch_lightning/trainer/connectors/data_connector.py +++ b/pytorch_lightning/trainer/connectors/data_connector.py @@ -381,13 +381,17 @@ def _reset_eval_dataloader( loader_num_batches = [] # determine number of batches - # datasets could be none, 1 or 2+ module = model or self.trainer.lightning_module or self.datamodule if len(dataloaders) != 0: for i, dataloader in enumerate(dataloaders): orig_num_batches = num_batches = ( len(dataloader) if has_len_all_ranks(dataloader, self.trainer.strategy, module) else float("inf") ) + + if orig_num_batches == 0: + loader_num_batches.append(orig_num_batches) + continue + self._worker_check(dataloader, f"{mode.dataloader_prefix}_dataloader {i}") # percent or num_steps @@ -395,23 +399,27 @@ def _reset_eval_dataloader( # limit num batches either as a percent or num steps if isinstance(limit_eval_batches, int): - num_batches = min(num_batches, int(limit_eval_batches)) - elif num_batches != float("inf"): - num_batches = int(num_batches * limit_eval_batches) + num_batches = min(orig_num_batches, limit_eval_batches) + elif isinstance(limit_eval_batches, float) and orig_num_batches != float("inf"): + num_batches = int(orig_num_batches * limit_eval_batches) elif limit_eval_batches != 1.0: raise MisconfigurationException( - f"When using an IterableDataset for `limit_{mode}_batches`," - f" `Trainer(limit_{mode.dataloader_prefix}_batches)` must be `1.0` or an int. An int k" - f" specifies `num_{mode.dataloader_prefix}_batches` to use." + f"When using an `IterableDataset`, `Trainer(limit_{mode.dataloader_prefix}_batches)` must be" + f" `1.0` or an int. An int specifies `num_{mode.dataloader_prefix}_batches` to use." 
                     )
-            if num_batches == 0 and limit_eval_batches > 0.0 and isinstance(limit_eval_batches, float):
-                min_pct = 1.0 / len(dataloader)
+
+                if (
+                    num_batches == 0
+                    and limit_eval_batches > 0.0
+                    and isinstance(limit_eval_batches, float)
+                    and orig_num_batches != float("inf")
+                ):
+                    min_percentage = 1.0 / orig_num_batches
                     raise MisconfigurationException(
                         f"You requested to check {limit_eval_batches} of the `{mode.dataloader_prefix}_dataloader` but"
                         f" {limit_eval_batches} * {orig_num_batches} < 1. Please increase the"
-                        f" `limit_{mode.dataloader_prefix}_batches` flag. Try at least"
-                        f" `limit_{mode.dataloader_prefix}_batches={min_pct}`"
+                        f" `limit_{mode.dataloader_prefix}_batches` argument. Try at least"
+                        f" `limit_{mode.dataloader_prefix}_batches={min_percentage}`"
                     )

             loader_num_batches.append(num_batches)
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 384e1f43b33e2..37a5c801b23e3 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -1814,21 +1814,25 @@ def reset_train_dataloader(self, model: Optional["pl.LightningModule"] = None) -
         self.train_dataloader = CombinedLoader(loaders, self._data_connector.multiple_trainloader_mode)

         module = model or self.lightning_module or self.datamodule
-        self.num_training_batches = (
+        orig_train_batches = self.num_training_batches = (
             len(self.train_dataloader)
             if has_len_all_ranks(self.train_dataloader, self.strategy, module)
             else float("inf")
         )
+        if orig_train_batches == 0:
+            return
+
+        # store epoch of dataloader reset for reload_dataloaders_every_n_epochs
+        self._last_train_dl_reload_epoch = self.current_epoch

         if isinstance(self.limit_train_batches, int):
-            self.num_training_batches = min(self.num_training_batches, int(self.limit_train_batches))
+            self.num_training_batches = min(orig_train_batches, self.limit_train_batches)
         elif self.num_training_batches != float("inf"):
-            self.num_training_batches = int(self.num_training_batches * self.limit_train_batches)
+            self.num_training_batches = int(orig_train_batches * self.limit_train_batches)
         elif self.limit_train_batches != 1.0:
             raise MisconfigurationException(
-                "When using an IterableDataset for `limit_train_batches`,"
-                " `Trainer(limit_train_batches)` must be `1.0` or an int. An int k specifies"
-                " `num_training_batches` to use."
+                "When using an `IterableDataset`, `Trainer(limit_train_batches)` must be `1.0` or an int."
+                " An int specifies `num_training_batches` to use."
             )

         if isinstance(self.val_check_interval, int):
@@ -1862,8 +1866,19 @@ def reset_train_dataloader(self, model: Optional["pl.LightningModule"] = None) -
                 category=PossibleUserWarning,
             )

-        # store epoch of dataloader reset for reload_dataloaders_every_n_epochs
-        self._last_train_dl_reload_epoch = self.current_epoch
+        if (
+            self.num_training_batches == 0
+            and self.limit_train_batches > 0.0
+            and isinstance(self.limit_train_batches, float)
+            and orig_train_batches != float("inf")
+        ):
+            min_percentage = 1.0 / orig_train_batches
+            raise MisconfigurationException(
+                f"You requested to check {self.limit_train_batches} of the `train_dataloader` but"
+                f" {self.limit_train_batches} * {orig_train_batches} < 1. Please increase the"
+                f" `limit_train_batches` argument. Try at least"
+                f" `limit_train_batches={min_percentage}`"
+            )

     def reset_val_dataloader(self, model: Optional["pl.LightningModule"] = None) -> None:
         """Resets the validation dataloader and determines the number of batches.
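For readers skimming the patch, here is a minimal, hedged sketch (not part of the PR) of the behaviour the `trainer.py` change introduces: a float `limit_train_batches` that truncates to zero batches now raises a `MisconfigurationException` instead of silently skipping training, which is what the test removed from `tests/trainer/test_trainer.py` below used to assert. `TinyDataset` and `TinyModel` are made-up stand-ins for the repository's test helpers, not identifiers from this PR.

```python
# Illustrative sketch only; assumes pytorch_lightning 1.6.x with this patch applied.
import torch
from torch.utils.data import DataLoader, Dataset

from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.utilities.exceptions import MisconfigurationException


class TinyDataset(Dataset):
    def __len__(self):
        return 144  # 144 samples / batch_size 16 -> 9 batches

    def __getitem__(self, idx):
        return torch.randn(32)


class TinyModel(LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        return self.layer(batch).sum()

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)


if __name__ == "__main__":
    # int(9 * 0.1) == 0, so no training batch would run; the new check raises instead.
    trainer = Trainer(limit_train_batches=0.1, max_epochs=1)
    try:
        trainer.fit(TinyModel(), DataLoader(TinyDataset(), batch_size=16))
    except MisconfigurationException as err:
        print(err)  # error suggests the minimum usable fraction for this dataloader
```

The suggested minimum in the error message comes from `min_percentage = 1.0 / orig_train_batches`, i.e. 1/9 ≈ 0.111 for the dataloader above, mirroring the pre-existing check for eval dataloaders in `_reset_eval_dataloader`.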
diff --git a/tests/trainer/connectors/test_data_connector.py b/tests/trainer/connectors/test_data_connector.py index c335644048db2..7d1b6f0a23bb4 100644 --- a/tests/trainer/connectors/test_data_connector.py +++ b/tests/trainer/connectors/test_data_connector.py @@ -539,3 +539,17 @@ def val_dataloader(self): trainer._data_connector.attach_data(model) trainer.reset_val_dataloader(model) assert trainer.val_dataloaders[0].sampler.shuffle == shuffle + + +def test_error_raised_with_insufficient_float_limit_train_dataloader(): + batch_size = 16 + dl = DataLoader(RandomDataset(32, batch_size * 9), batch_size=batch_size) + trainer = Trainer(limit_train_batches=0.1) + model = BoringModel() + + trainer._data_connector.attach_data(model=model, train_dataloaders=dl) + with pytest.raises( + MisconfigurationException, + match="Please increase the `limit_train_batches` argument. Try at least", + ): + trainer.reset_train_dataloader(model) diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index 66b5be243bf80..651ee5786a7be 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -519,13 +519,18 @@ def test_mixing_of_dataloader_options(tmpdir, ckpt_path): def test_warning_on_zero_len_dataloader(tmpdir): """Test that a warning is raised if a zero-length dataloader is defined.""" model = BoringModel() - trainer = Trainer( - default_root_dir=tmpdir, - fast_dev_run=1, - ) - dataloader = DataLoader(RandomDataset(32, 0)) - with pytest.warns(UserWarning, match="returned 0 length"): - trainer.fit(model, dataloader) + trainer = Trainer() + train_dataloader = DataLoader(RandomDataset(32, 0)) + val_dataloader = DataLoader(RandomDataset(32, 0)) + trainer._data_connector.attach_data(model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader) + + with pytest.warns(UserWarning, match="Total length of `CombinedLoader` across ranks is zero"): + trainer.reset_train_dataloader(model) + assert trainer.num_training_batches == 0 + + with pytest.warns(UserWarning, match="Total length of `DataLoader` across ranks is zero"): + trainer.reset_val_dataloader(model) + assert trainer.num_val_batches == [0] @RunIf(skip_windows=True) @@ -962,7 +967,7 @@ def test_inf_dataloader_raise_error_with_partial_batch_limits(tmpdir, stage, dat trainer = Trainer(**trainer_kwargs) trainer_fn = "fit" if stage == RunningStage.TRAINING else stage.value - with pytest.raises(MisconfigurationException, match=r"using an IterableDataset .* must be `1.0` or an int"): + with pytest.raises(MisconfigurationException, match=r"IterableDataset`.*limit_.*_batches\)`.*`1.0` or an int"): getattr(trainer, trainer_fn)(model) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 316384a20c219..8d09b16094f41 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1416,58 +1416,6 @@ def test_predict_return_predictions_cpu(return_predictions, precision, tmpdir): assert preds[0].dtype == (torch.float64 if precision == 64 else torch.float32) -@pytest.mark.parametrize( - ["limit_train_batches", "global_step", "num_training_batches", "current_epoch", "should_train"], - [(0.2, 0, 0, 0, False), (0.5, 10, 2, 5, True)], -) -def test_disabled_training_for_insufficient_limit_train_batches( - tmpdir, limit_train_batches, global_step, num_training_batches, current_epoch, should_train -): - """Verify when `limit_train_batches` is float & between [0.0, 1.0] and. 
- - `int(self.num_training_batches * self.limit_train_batches) == 0`, the training loop is disabled. - """ - - class CurrentModel(BoringModel): - - training_step_invoked = False - training_epoch_end_invoked = False - - def training_step(self, *args, **kwargs): - self.training_step_invoked = True - return super().training_step(*args, **kwargs) - - def training_epoch_end(self, *args, **kwargs): - self.training_epoch_end_invoked = True - return super().training_epoch_end(*args, **kwargs) - - dataset_len = 100 - batch_size = 25 - - train = RandomDataset(32, length=dataset_len) - train_loader = DataLoader(train, batch_size=batch_size) - - model = CurrentModel() - - trainer = Trainer(default_root_dir=tmpdir, max_epochs=5, limit_train_batches=limit_train_batches) - trainer.fit(model, train_loader) - - params_string = f"""`limit_train_batches={limit_train_batches}`, `dataset_len={dataset_len}` - & `batch_size={batch_size}` as - `num_training_batches={num_training_batches}`""" - if should_train: - error_string = f"should run with {params_string}" - else: - error_string = f"should not run with {params_string}" - - assert trainer.state.finished, f"Training failed with {trainer.state}" - assert trainer.global_step == global_step - assert trainer.num_training_batches == num_training_batches - assert trainer.current_epoch == current_epoch - assert model.training_step_invoked == should_train, f"`training_step` {error_string}" - assert model.training_epoch_end_invoked == should_train, f"`training_epoch_end` {error_string}" - - @pytest.mark.parametrize(["max_steps", "max_epochs", "global_step"], [(10, 5, 10), (20, None, 20)]) def test_repeated_fit_calls_with_max_epochs_and_steps(tmpdir, max_steps, max_epochs, global_step): """Ensure that the training loop is bound by `max_steps` and `max_epochs` for repeated calls of `trainer.fit`,