From 5d7c59ecb7529b24e449e9a543aea530c42fa230 Mon Sep 17 00:00:00 2001 From: Peter Izsak Date: Wed, 22 Jan 2020 16:14:51 +0200 Subject: [PATCH 01/17] Added max number of steps in Trainer --- pytorch_lightning/trainer/trainer.py | 3 +++ pytorch_lightning/trainer/training_loop.py | 9 +++++++++ 2 files changed, 12 insertions(+) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index e1c1464c73cf8..0e0c058a17dcc 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -71,6 +71,7 @@ def __init__( min_nb_epochs=None, # backward compatible, todo: remove in v0.8.0 max_epochs=1000, min_epochs=1, + max_steps=None, train_percent_check=1.0, val_percent_check=1.0, test_percent_check=1.0, @@ -505,6 +506,8 @@ def __init__( min_epochs = min_nb_epochs self.min_epochs = min_epochs + self.max_steps = max_steps + # Backward compatibility if nb_sanity_val_steps is not None: warnings.warn("`nb_sanity_val_steps` has renamed to `num_sanity_val_steps` since v0.5.0" diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index cad5650aa4f6d..853b71df28f1c 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -345,6 +345,11 @@ def train(self): raise MisconfigurationException(m) self.reduce_lr_on_plateau_scheduler.step(val_loss, epoch=self.current_epoch) + if self.max_steps is not None and self.max_steps == self.global_step: + self.main_progress_bar.close() + model.on_train_end() + return + # early stopping met_min_epochs = epoch >= self.min_epochs - 1 if (self.enable_early_stop and not self.disable_validation and is_val_epoch and @@ -421,6 +426,10 @@ def run_training_epoch(self): self.global_step += 1 self.total_batch_idx += 1 + # max steps reached, end training + if self.max_steps is not None and self.max_steps == self.global_step: + break + # end epoch early # stop when the flag is changed or we've gone past the amount # requested in the batches From 47b4119f25328823d5a070840da46c8d925d8567 Mon Sep 17 00:00:00 2001 From: Peter Izsak Date: Tue, 28 Jan 2020 11:24:44 +0200 Subject: [PATCH 02/17] Added docstring --- pytorch_lightning/trainer/trainer.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 0e0c058a17dcc..7e81036de1dc1 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -290,6 +290,16 @@ def __init__( .. deprecated:: 0.5.0 Use `min_nb_epochs` instead. Will remove 0.8.0. + max_steps (int): Stop training after this number of steps. + Example:: + + # default used by the Trainer (disabled) + trainer = Trainer(max_steps=None) + + # Stop after 100 steps (batches) + trainer = Trainer(max_steps=100) + + train_percent_check (int): How much of training dataset to check. Useful when debugging or testing something that happens at the end of an epoch. Example:: From 00c37389ebf324fcbf5dd67e49b46da37992ff49 Mon Sep 17 00:00:00 2001 From: Peter Izsak Date: Tue, 28 Jan 2020 12:01:41 +0200 Subject: [PATCH 03/17] Fix flake8 errors --- pytorch_lightning/trainer/trainer.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 7e81036de1dc1..63828a814ef8a 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -292,13 +292,12 @@ def __init__( max_steps (int): Stop training after this number of steps. Example:: - + # default used by the Trainer (disabled) trainer = Trainer(max_steps=None) - + # Stop after 100 steps (batches) trainer = Trainer(max_steps=100) - train_percent_check (int): How much of training dataset to check. Useful when debugging or testing something that happens at the end of an epoch. From e468f3163787b6a7b3c489a1b2f2ce8542b11a38 Mon Sep 17 00:00:00 2001 From: Peter Izsak Date: Thu, 30 Jan 2020 12:32:21 +0200 Subject: [PATCH 04/17] Clarified docstrings --- pytorch_lightning/trainer/trainer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 63828a814ef8a..22a23756f85ab 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -290,7 +290,9 @@ def __init__( .. deprecated:: 0.5.0 Use `min_nb_epochs` instead. Will remove 0.8.0. - max_steps (int): Stop training after this number of steps. + max_steps (int): Stop training after this number of steps. Disabled by default (None). + Training will stop if max_epochs is reached prior to max_steps. + Training will stop despite not running for min_epochs. Example:: # default used by the Trainer (disabled) From 62b0b080ac496f2538f834cbfc275850f6a195f8 Mon Sep 17 00:00:00 2001 From: Peter Izsak Date: Thu, 30 Jan 2020 15:13:56 +0200 Subject: [PATCH 05/17] Fixed flake8 error --- pytorch_lightning/trainer/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 22a23756f85ab..3d1f1c60649a1 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -292,7 +292,7 @@ def __init__( max_steps (int): Stop training after this number of steps. Disabled by default (None). Training will stop if max_epochs is reached prior to max_steps. - Training will stop despite not running for min_epochs. + Training will stop despite not running for min_epochs. Example:: # default used by the Trainer (disabled) From 56fc41041424c4c4135ed1a2e57cbc3bdd99fdfa Mon Sep 17 00:00:00 2001 From: Peter Izsak Date: Sun, 2 Feb 2020 15:20:25 +0200 Subject: [PATCH 06/17] Added min_steps to Trainer --- pytorch_lightning/trainer/trainer.py | 17 ++++++++++++++--- pytorch_lightning/trainer/training_loop.py | 4 +++- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 22a23756f85ab..74b6404bb0659 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -72,6 +72,7 @@ def __init__( max_epochs=1000, min_epochs=1, max_steps=None, + min_steps=None, train_percent_check=1.0, val_percent_check=1.0, test_percent_check=1.0, @@ -291,16 +292,25 @@ def __init__( Use `min_nb_epochs` instead. Will remove 0.8.0. max_steps (int): Stop training after this number of steps. Disabled by default (None). - Training will stop if max_epochs is reached prior to max_steps. - Training will stop despite not running for min_epochs. + Training will stop if max_steps or max_epochs have reached (earliest). Example:: # default used by the Trainer (disabled) trainer = Trainer(max_steps=None) - # Stop after 100 steps (batches) + # Stop after 100 steps trainer = Trainer(max_steps=100) + min_steps(int): Force training for at least these number of steps. + Trainer will train model for at least min_steps or min_epochs (latest). + Example:: + + # default used by the Trainer (disabled) + trainer = Trainer(min_steps=None) + + # Run at least for 100 steps (disable min_epochs) + trainer = Trainer(min_steps=100, min_epochs=0) + train_percent_check (int): How much of training dataset to check. Useful when debugging or testing something that happens at the end of an epoch. Example:: @@ -518,6 +528,7 @@ def __init__( self.min_epochs = min_epochs self.max_steps = max_steps + self.min_steps = min_steps # Backward compatibility if nb_sanity_val_steps is not None: diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 853b71df28f1c..4b2da43df16b7 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -352,8 +352,10 @@ def train(self): # early stopping met_min_epochs = epoch >= self.min_epochs - 1 + met_min_steps = self.global_step >= self.min_steps if self.min_step is not None else True + if (self.enable_early_stop and not self.disable_validation and is_val_epoch and - (met_min_epochs or self.fast_dev_run)): + (met_min_epochs or met_min_steps or self.fast_dev_run)): should_stop = self.early_stop_callback.on_epoch_end(epoch=epoch, logs=self.callback_metrics) # stop training From b9e91d4226b06c78ec384ced2c6c0fe3905e3c2c Mon Sep 17 00:00:00 2001 From: Peter Izsak Date: Mon, 3 Feb 2020 13:01:17 +0200 Subject: [PATCH 07/17] Added steps and epochs test --- pytorch_lightning/trainer/training_loop.py | 4 +- tests/test_trainer.py | 54 ++++++++++++++++++++++ 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 4b2da43df16b7..3031667bcfbc8 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -352,10 +352,10 @@ def train(self): # early stopping met_min_epochs = epoch >= self.min_epochs - 1 - met_min_steps = self.global_step >= self.min_steps if self.min_step is not None else True + met_min_steps = self.global_step >= self.min_steps if self.min_steps is not None else True if (self.enable_early_stop and not self.disable_validation and is_val_epoch and - (met_min_epochs or met_min_steps or self.fast_dev_run)): + ((met_min_epochs and met_min_steps) or self.fast_dev_run)): should_stop = self.early_stop_callback.on_epoch_end(epoch=epoch, logs=self.callback_metrics) # stop training diff --git a/tests/test_trainer.py b/tests/test_trainer.py index ddc74b40f8ca4..e9a751db83d3c 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -6,6 +6,7 @@ import tests.utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ( + EarlyStopping, ModelCheckpoint, ) from pytorch_lightning.testing import ( @@ -410,6 +411,59 @@ class CurrentTestModel( # run the test method trainer.test() +def test_num_trainer_steps(tmpdir): + """Verify model trains according to speficied steps""" + tutils.reset_seed() + model, _ = tutils.get_model() + + trainer_options = dict( + max_epochs=5, + gpus=None, + default_save_path=tmpdir, + train_percent_check=0.05, + ) + + trainer_options['max_epochs'] = 2 + trainer_options['max_steps'] = 100 + trainer = Trainer(**trainer_options) + result = trainer.fit(model) + assert result == 1 + # should stop at max_steps + assert trainer.global_step == 100, "Model did not stop at max_steps" + + trainer_options['max_epochs'] = 2 + trainer_options['max_steps'] = 500 + trainer = Trainer(**trainer_options) + result = trainer.fit(model) + assert result == 1 + # should stop at max_epochs + assert trainer.global_step == 93 * 2 and \ + trainer.current_epoch == 1, "Model did not stop at max_epochs" + + stopping = EarlyStopping(monitor='val_loss', min_delta=1.0) + trainer_options['early_stop_callback'] = stopping + trainer_options['min_epochs'] = 1 + trainer_options['min_steps'] = 10 + trainer_options['max_epochs'] = 10 + trainer = Trainer(**trainer_options) + result = trainer.fit(model) + assert result == 1 + # should run at least 1 epoch + assert trainer.global_step >= 93 and \ + trainer.current_epoch > 0, "Model did not train for at least min_epochs" + + stopping = EarlyStopping(monitor='val_loss', min_delta=1.0) + trainer_options['early_stop_callback'] = stopping + trainer_options['val_check_interval'] = 20 + trainer_options['min_epochs'] = 1 + trainer_options['min_steps'] = 100 + trainer_options['max_epochs'] = 10 + trainer = Trainer(**trainer_options) + result = trainer.fit(model) + assert result == 1 + # should run at least 100 steps + assert trainer.global_step >= 100 and \ + trainer.current_epoch > 0, "Model did not train for at least min_steps" # if __name__ == '__main__': # pytest.main([__file__]) From 203a4419a5c0a35dd4ff52a7ccc2593cd3d83480 Mon Sep 17 00:00:00 2001 From: Peter Izsak Date: Mon, 3 Feb 2020 13:44:48 +0200 Subject: [PATCH 08/17] flake8 --- tests/test_trainer.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test_trainer.py b/tests/test_trainer.py index e9a751db83d3c..fa40e2c7b6767 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -411,18 +411,19 @@ class CurrentTestModel( # run the test method trainer.test() + def test_num_trainer_steps(tmpdir): """Verify model trains according to speficied steps""" tutils.reset_seed() model, _ = tutils.get_model() - + trainer_options = dict( max_epochs=5, gpus=None, default_save_path=tmpdir, train_percent_check=0.05, ) - + trainer_options['max_epochs'] = 2 trainer_options['max_steps'] = 100 trainer = Trainer(**trainer_options) @@ -430,7 +431,7 @@ def test_num_trainer_steps(tmpdir): assert result == 1 # should stop at max_steps assert trainer.global_step == 100, "Model did not stop at max_steps" - + trainer_options['max_epochs'] = 2 trainer_options['max_steps'] = 500 trainer = Trainer(**trainer_options) @@ -451,7 +452,7 @@ def test_num_trainer_steps(tmpdir): # should run at least 1 epoch assert trainer.global_step >= 93 and \ trainer.current_epoch > 0, "Model did not train for at least min_epochs" - + stopping = EarlyStopping(monitor='val_loss', min_delta=1.0) trainer_options['early_stop_callback'] = stopping trainer_options['val_check_interval'] = 20 From 90077729640a8bb50072bc008903084222b6480d Mon Sep 17 00:00:00 2001 From: Peter Izsak Date: Mon, 3 Feb 2020 16:07:31 +0200 Subject: [PATCH 09/17] minor fix --- tests/test_trainer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_trainer.py b/tests/test_trainer.py index fa40e2c7b6767..96308f31ea4a1 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -441,6 +441,7 @@ def test_num_trainer_steps(tmpdir): assert trainer.global_step == 93 * 2 and \ trainer.current_epoch == 1, "Model did not stop at max_epochs" + trainer.pop("max_steps", None) stopping = EarlyStopping(monitor='val_loss', min_delta=1.0) trainer_options['early_stop_callback'] = stopping trainer_options['min_epochs'] = 1 From 9d6590a747d9baa1fd883b772fc1cfcc649da447 Mon Sep 17 00:00:00 2001 From: Peter Izsak Date: Mon, 3 Feb 2020 16:59:36 +0200 Subject: [PATCH 10/17] fix steps test in test_trainer --- tests/test_trainer.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/tests/test_trainer.py b/tests/test_trainer.py index fa40e2c7b6767..84fac90fe3c86 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -1,3 +1,4 @@ +import math import os import pytest @@ -417,53 +418,55 @@ def test_num_trainer_steps(tmpdir): tutils.reset_seed() model, _ = tutils.get_model() + train_percent = 0.05 + num_train_samples = math.floor(len(model.train_dataloader()) * train_percent) + trainer_options = dict( - max_epochs=5, gpus=None, default_save_path=tmpdir, - train_percent_check=0.05, + train_percent_check=train_percent, ) - - trainer_options['max_epochs'] = 2 - trainer_options['max_steps'] = 100 + + trainer_options['max_epochs'] = 5 + trainer_options['max_steps'] = num_train_samples + 10 trainer = Trainer(**trainer_options) result = trainer.fit(model) assert result == 1 # should stop at max_steps - assert trainer.global_step == 100, "Model did not stop at max_steps" + assert trainer.global_step == trainer_options['max_steps'], "Model did not stop at max_steps" trainer_options['max_epochs'] = 2 - trainer_options['max_steps'] = 500 + trainer_options['max_steps'] = trainer_options['max_epochs'] * 2 * num_train_samples trainer = Trainer(**trainer_options) result = trainer.fit(model) assert result == 1 # should stop at max_epochs - assert trainer.global_step == 93 * 2 and \ - trainer.current_epoch == 1, "Model did not stop at max_epochs" + assert trainer.global_step == num_train_samples * trainer_options['max_epochs'] and \ + trainer.current_epoch == trainer_options['max_epochs'] - 1, "Model did not stop at max_epochs" stopping = EarlyStopping(monitor='val_loss', min_delta=1.0) trainer_options['early_stop_callback'] = stopping trainer_options['min_epochs'] = 1 - trainer_options['min_steps'] = 10 + trainer_options['min_steps'] = math.floor(num_train_samples / 2) trainer_options['max_epochs'] = 10 trainer = Trainer(**trainer_options) result = trainer.fit(model) assert result == 1 # should run at least 1 epoch - assert trainer.global_step >= 93 and \ + assert trainer.global_step >= num_train_samples and \ trainer.current_epoch > 0, "Model did not train for at least min_epochs" stopping = EarlyStopping(monitor='val_loss', min_delta=1.0) trainer_options['early_stop_callback'] = stopping trainer_options['val_check_interval'] = 20 trainer_options['min_epochs'] = 1 - trainer_options['min_steps'] = 100 + trainer_options['min_steps'] = math.floor(num_train_samples * 1.5) trainer_options['max_epochs'] = 10 trainer = Trainer(**trainer_options) result = trainer.fit(model) assert result == 1 # should run at least 100 steps - assert trainer.global_step >= 100 and \ + assert trainer.global_step >= math.floor(num_train_samples * 1.5) and \ trainer.current_epoch > 0, "Model did not train for at least min_steps" # if __name__ == '__main__': From 24d967e87842a56b77df6e24987b82e23b4680e7 Mon Sep 17 00:00:00 2001 From: Peter Izsak Date: Sun, 9 Feb 2020 11:29:18 +0200 Subject: [PATCH 11/17] Split steps test into 2 tests --- tests/test_trainer.py | 52 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 8 deletions(-) diff --git a/tests/test_trainer.py b/tests/test_trainer.py index 049effefd8eab..077f3c13df5b1 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -413,60 +413,96 @@ class CurrentTestModel( trainer.test() -def test_num_trainer_steps(tmpdir): - """Verify model trains according to speficied steps""" +def test_trainer_max_steps_and_epochs(tmpdir): + """Verify model trains according to speficied max steps""" tutils.reset_seed() model, _ = tutils.get_model() + # define train epoch to 5% of data train_percent = 0.05 + # get number of samples in 1 epoch num_train_samples = math.floor(len(model.train_dataloader()) * train_percent) trainer_options = dict( - gpus=None, default_save_path=tmpdir, train_percent_check=train_percent, ) + # define less train steps than epochs trainer_options['max_epochs'] = 5 trainer_options['max_steps'] = num_train_samples + 10 + + # fit model trainer = Trainer(**trainer_options) result = trainer.fit(model) + + # check training stopped at max_steps assert result == 1 - # should stop at max_steps assert trainer.global_step == trainer_options['max_steps'], "Model did not stop at max_steps" + # define less train epochs than steps trainer_options['max_epochs'] = 2 trainer_options['max_steps'] = trainer_options['max_epochs'] * 2 * num_train_samples + + # fit model trainer = Trainer(**trainer_options) result = trainer.fit(model) assert result == 1 - # should stop at max_epochs + + # check training stopped at max_epochs assert trainer.global_step == num_train_samples * trainer_options['max_epochs'] and \ trainer.current_epoch == trainer_options['max_epochs'] - 1, "Model did not stop at max_epochs" - trainer_options.pop("max_steps", None) + +def test_trainer_min_steps_and_epochs(tmpdir): + """Verify model trains according to speficied min steps""" + tutils.reset_seed() + model, _ = tutils.get_model() + + # define train epoch to 5% of data + train_percent = 0.05 + # get number of samples in 1 epoch + num_train_samples = math.floor(len(model.train_dataloader()) * train_percent) + + trainer_options = dict( + default_save_path=tmpdir, + train_percent_check=train_percent, + ) + + # define callback for stopping the model stopping = EarlyStopping(monitor='val_loss', min_delta=1.0) trainer_options['early_stop_callback'] = stopping + trainer_options['val_check_interval'] = 20 + + # define less min steps than 1 epoch trainer_options['min_epochs'] = 1 trainer_options['min_steps'] = math.floor(num_train_samples / 2) trainer_options['max_epochs'] = 10 + + # fit model trainer = Trainer(**trainer_options) result = trainer.fit(model) assert result == 1 - # should run at least 1 epoch + + # check model ran for at least min_epochs assert trainer.global_step >= num_train_samples and \ trainer.current_epoch > 0, "Model did not train for at least min_epochs" stopping = EarlyStopping(monitor='val_loss', min_delta=1.0) trainer_options['early_stop_callback'] = stopping trainer_options['val_check_interval'] = 20 + + # define less epochs than min_steps trainer_options['min_epochs'] = 1 trainer_options['min_steps'] = math.floor(num_train_samples * 1.5) trainer_options['max_epochs'] = 10 + + # fit model trainer = Trainer(**trainer_options) result = trainer.fit(model) assert result == 1 - # should run at least 100 steps + + # check model ran for at least num_train_samples*1.5 assert trainer.global_step >= math.floor(num_train_samples * 1.5) and \ trainer.current_epoch > 0, "Model did not train for at least min_steps" From b967119cd513d2d767e8ed4baf0ac2393ee02850 Mon Sep 17 00:00:00 2001 From: Peter Izsak Date: Tue, 11 Feb 2020 11:35:36 +0200 Subject: [PATCH 12/17] Refactor steps test --- tests/test_trainer.py | 36 ++++++++++++------------------------ 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/tests/test_trainer.py b/tests/test_trainer.py index 077f3c13df5b1..3b13924fcf78c 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -413,8 +413,8 @@ class CurrentTestModel( trainer.test() -def test_trainer_max_steps_and_epochs(tmpdir): - """Verify model trains according to speficied max steps""" +def _init_steps_model(): + """private method for initializing a model with 5% train epochs""" tutils.reset_seed() model, _ = tutils.get_model() @@ -424,9 +424,14 @@ def test_trainer_max_steps_and_epochs(tmpdir): num_train_samples = math.floor(len(model.train_dataloader()) * train_percent) trainer_options = dict( - default_save_path=tmpdir, train_percent_check=train_percent, ) + return model, trainer_options, num_train_samples + + +def test_trainer_max_steps_and_epochs(tmpdir): + """Verify model trains according to speficied max steps""" + model, trainer_options, num_train_samples = _init_steps_model() # define less train steps than epochs trainer_options['max_epochs'] = 5 @@ -456,28 +461,17 @@ def test_trainer_max_steps_and_epochs(tmpdir): def test_trainer_min_steps_and_epochs(tmpdir): """Verify model trains according to speficied min steps""" - tutils.reset_seed() - model, _ = tutils.get_model() + model, trainer_options, num_train_samples = _init_steps_model() - # define train epoch to 5% of data - train_percent = 0.05 - # get number of samples in 1 epoch - num_train_samples = math.floor(len(model.train_dataloader()) * train_percent) - - trainer_options = dict( - default_save_path=tmpdir, - train_percent_check=train_percent, - ) - - # define callback for stopping the model + # define callback for stopping the model and default epochs stopping = EarlyStopping(monitor='val_loss', min_delta=1.0) trainer_options['early_stop_callback'] = stopping trainer_options['val_check_interval'] = 20 + trainer_options['min_epochs'] = 1 + trainer_options['max_epochs'] = 10 # define less min steps than 1 epoch - trainer_options['min_epochs'] = 1 trainer_options['min_steps'] = math.floor(num_train_samples / 2) - trainer_options['max_epochs'] = 10 # fit model trainer = Trainer(**trainer_options) @@ -488,14 +482,8 @@ def test_trainer_min_steps_and_epochs(tmpdir): assert trainer.global_step >= num_train_samples and \ trainer.current_epoch > 0, "Model did not train for at least min_epochs" - stopping = EarlyStopping(monitor='val_loss', min_delta=1.0) - trainer_options['early_stop_callback'] = stopping - trainer_options['val_check_interval'] = 20 - # define less epochs than min_steps - trainer_options['min_epochs'] = 1 trainer_options['min_steps'] = math.floor(num_train_samples * 1.5) - trainer_options['max_epochs'] = 10 # fit model trainer = Trainer(**trainer_options) From 3440196916195d530c43ca0c306b23651d9a93a2 Mon Sep 17 00:00:00 2001 From: Peter Izsak Date: Tue, 11 Feb 2020 12:27:35 +0200 Subject: [PATCH 13/17] Update test_trainer.py --- tests/test_trainer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_trainer.py b/tests/test_trainer.py index 3b13924fcf78c..ab93b6236e111 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -464,8 +464,7 @@ def test_trainer_min_steps_and_epochs(tmpdir): model, trainer_options, num_train_samples = _init_steps_model() # define callback for stopping the model and default epochs - stopping = EarlyStopping(monitor='val_loss', min_delta=1.0) - trainer_options['early_stop_callback'] = stopping + trainer_options['early_stop_callback'] = EarlyStopping(monitor='val_loss', min_delta=1.0) trainer_options['val_check_interval'] = 20 trainer_options['min_epochs'] = 1 trainer_options['max_epochs'] = 10 From ec71b4f0647d2816e22544e51b206adde163ae0b Mon Sep 17 00:00:00 2001 From: Peter Izsak Date: Thu, 13 Feb 2020 13:08:54 +0200 Subject: [PATCH 14/17] Minor in test_trainer.py --- tests/test_trainer.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/test_trainer.py b/tests/test_trainer.py index de9049e5d961b..4ca2186353c09 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -464,10 +464,12 @@ def test_trainer_min_steps_and_epochs(tmpdir): model, trainer_options, num_train_samples = _init_steps_model() # define callback for stopping the model and default epochs - trainer_options['early_stop_callback'] = EarlyStopping(monitor='val_loss', min_delta=1.0) - trainer_options['val_check_interval'] = 20 - trainer_options['min_epochs'] = 1 - trainer_options['max_epochs'] = 10 + trainer_options.update({ + 'early_stop_callback': EarlyStopping(monitor='val_loss', min_delta=1.0), + 'val_check_interval': 20, + 'min_epochs': 1, + 'max_epochs': 10 + }) # define less min steps than 1 epoch trainer_options['min_steps'] = math.floor(num_train_samples / 2) From 4e7adfabb3aa7dc5fa604e9c85bb1aacd8f26250 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 15 Feb 2020 18:40:50 -0500 Subject: [PATCH 15/17] Update test_trainer.py --- tests/test_trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_trainer.py b/tests/test_trainer.py index 4ca2186353c09..43395995f6868 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -455,8 +455,8 @@ def test_trainer_max_steps_and_epochs(tmpdir): assert result == 1 # check training stopped at max_epochs - assert trainer.global_step == num_train_samples * trainer_options['max_epochs'] and \ - trainer.current_epoch == trainer_options['max_epochs'] - 1, "Model did not stop at max_epochs" + assert trainer.global_step == num_train_samples * trainer_options['max_epochs'] + assert trainer.current_epoch == trainer_options['max_epochs'] - 1, "Model did not stop at max_epochs" def test_trainer_min_steps_and_epochs(tmpdir): From 927d18fe3edb4488384c54e20d2973fd7260d318 Mon Sep 17 00:00:00 2001 From: Peter Izsak Date: Sun, 16 Feb 2020 10:48:21 +0200 Subject: [PATCH 16/17] Address PR comments --- pytorch_lightning/trainer/trainer.py | 8 +------- pytorch_lightning/trainer/training_loop.py | 4 ++-- tests/test_trainer.py | 24 ++++++++++------------ 3 files changed, 14 insertions(+), 22 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 815b67203cbce..4cb655fae6eaf 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -298,19 +298,13 @@ def __init__( Training will stop if max_steps or max_epochs have reached (earliest). Example:: - # default used by the Trainer (disabled) - trainer = Trainer(max_steps=None) - # Stop after 100 steps trainer = Trainer(max_steps=100) - min_steps(int): Force training for at least these number of steps. + min_steps(int): Force training for at least these number of steps. Disabled by default (None). Trainer will train model for at least min_steps or min_epochs (latest). Example:: - # default used by the Trainer (disabled) - trainer = Trainer(min_steps=None) - # Run at least for 100 steps (disable min_epochs) trainer = Trainer(min_steps=100, min_epochs=0) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 9ef9e09389b52..0c433c0df2b0c 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -348,14 +348,14 @@ def train(self): raise MisconfigurationException(m) self.reduce_lr_on_plateau_scheduler.step(val_loss, epoch=self.current_epoch) - if self.max_steps is not None and self.max_steps == self.global_step: + if self.max_steps and self.max_steps == self.global_step: self.main_progress_bar.close() model.on_train_end() return # early stopping met_min_epochs = epoch >= self.min_epochs - 1 - met_min_steps = self.global_step >= self.min_steps if self.min_steps is not None else True + met_min_steps = self.global_step >= self.min_steps if self.min_steps else True if (self.enable_early_stop and not self.disable_validation and is_val_epoch and ((met_min_epochs and met_min_steps) or self.fast_dev_run)): diff --git a/tests/test_trainer.py b/tests/test_trainer.py index 43395995f6868..0275fdd7ca83e 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -430,20 +430,21 @@ def _init_steps_model(): def test_trainer_max_steps_and_epochs(tmpdir): - """Verify model trains according to speficied max steps""" + """Verify model trains according to specified max steps""" model, trainer_options, num_train_samples = _init_steps_model() # define less train steps than epochs - trainer_options['max_epochs'] = 5 - trainer_options['max_steps'] = num_train_samples + 10 + trainer_options.update(dict( + max_epochs=5, + max_steps=num_train_samples + 10 + )) # fit model trainer = Trainer(**trainer_options) result = trainer.fit(model) # check training stopped at max_steps - assert result == 1 - assert trainer.global_step == trainer_options['max_steps'], "Model did not stop at max_steps" + assert result == 1 and trainer.global_step == trainer.max_steps, "Model did not stop at max_steps" # define less train epochs than steps trainer_options['max_epochs'] = 2 @@ -452,15 +453,14 @@ def test_trainer_max_steps_and_epochs(tmpdir): # fit model trainer = Trainer(**trainer_options) result = trainer.fit(model) - assert result == 1 # check training stopped at max_epochs - assert trainer.global_step == num_train_samples * trainer_options['max_epochs'] - assert trainer.current_epoch == trainer_options['max_epochs'] - 1, "Model did not stop at max_epochs" + assert result == 1 and trainer.global_step == num_train_samples * trainer.max_nb_epochs \ + and trainer.current_epoch == trainer.max_nb_epochs - 1, "Model did not stop at max_epochs" def test_trainer_min_steps_and_epochs(tmpdir): - """Verify model trains according to speficied min steps""" + """Verify model trains according to specified min steps""" model, trainer_options, num_train_samples = _init_steps_model() # define callback for stopping the model and default epochs @@ -477,10 +477,9 @@ def test_trainer_min_steps_and_epochs(tmpdir): # fit model trainer = Trainer(**trainer_options) result = trainer.fit(model) - assert result == 1 # check model ran for at least min_epochs - assert trainer.global_step >= num_train_samples and \ + assert result == 1 and trainer.global_step >= num_train_samples and \ trainer.current_epoch > 0, "Model did not train for at least min_epochs" # define less epochs than min_steps @@ -489,10 +488,9 @@ def test_trainer_min_steps_and_epochs(tmpdir): # fit model trainer = Trainer(**trainer_options) result = trainer.fit(model) - assert result == 1 # check model ran for at least num_train_samples*1.5 - assert trainer.global_step >= math.floor(num_train_samples * 1.5) and \ + assert result == 1 and trainer.global_step >= math.floor(num_train_samples * 1.5) and \ trainer.current_epoch > 0, "Model did not train for at least min_steps" # if __name__ == '__main__': From b66703c9823d5eb06b3d7f222ad0ecdc0baa1ec7 Mon Sep 17 00:00:00 2001 From: Peter Izsak Date: Tue, 18 Feb 2020 15:11:34 +0200 Subject: [PATCH 17/17] Minor --- tests/test_trainer.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/test_trainer.py b/tests/test_trainer.py index e6371df1d6570..2b064357ea7de 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -478,9 +478,10 @@ def test_trainer_max_steps_and_epochs(tmpdir): # fit model trainer = Trainer(**trainer_options) result = trainer.fit(model) + assert result == 1, "Training did not complete" # check training stopped at max_steps - assert result == 1 and trainer.global_step == trainer.max_steps, "Model did not stop at max_steps" + assert trainer.global_step == trainer.max_steps, "Model did not stop at max_steps" # define less train epochs than steps trainer_options['max_epochs'] = 2 @@ -489,9 +490,10 @@ def test_trainer_max_steps_and_epochs(tmpdir): # fit model trainer = Trainer(**trainer_options) result = trainer.fit(model) + assert result == 1, "Training did not complete" # check training stopped at max_epochs - assert result == 1 and trainer.global_step == num_train_samples * trainer.max_nb_epochs \ + assert trainer.global_step == num_train_samples * trainer.max_nb_epochs \ and trainer.current_epoch == trainer.max_nb_epochs - 1, "Model did not stop at max_epochs" @@ -513,9 +515,10 @@ def test_trainer_min_steps_and_epochs(tmpdir): # fit model trainer = Trainer(**trainer_options) result = trainer.fit(model) + assert result == 1, "Training did not complete" # check model ran for at least min_epochs - assert result == 1 and trainer.global_step >= num_train_samples and \ + assert trainer.global_step >= num_train_samples and \ trainer.current_epoch > 0, "Model did not train for at least min_epochs" # define less epochs than min_steps @@ -524,9 +527,10 @@ def test_trainer_min_steps_and_epochs(tmpdir): # fit model trainer = Trainer(**trainer_options) result = trainer.fit(model) + assert result == 1, "Training did not complete" # check model ran for at least num_train_samples*1.5 - assert result == 1 and trainer.global_step >= math.floor(num_train_samples * 1.5) and \ + assert trainer.global_step >= math.floor(num_train_samples * 1.5) and \ trainer.current_epoch > 0, "Model did not train for at least min_steps" # if __name__ == '__main__':