Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Increment epochs based on last_batch() instead of at the end of the train loop. #3668

Merged
merged 3 commits into from
Sep 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions ludwig/trainers/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -912,7 +912,7 @@ def train(

self.callback(lambda c: c.on_epoch_start(self, progress_tracker, save_path))

# Trains over a full epoch of data.
# Trains over a full epoch of data or up to the last training step, whichever is sooner.
should_break = self._train_loop(
batcher,
progress_tracker,
Expand All @@ -934,10 +934,6 @@ def train(
profiler,
)

# ================ Post Training Epoch ================
progress_tracker.epoch += 1
self.callback(lambda c: c.on_epoch_end(self, progress_tracker, save_path))

if self.is_coordinator():
# ========== Save training progress ==========
logger.debug(
Expand Down Expand Up @@ -1114,8 +1110,16 @@ def _train_loop(
# batch duration measurements when using timer callbacks.
self.callback(lambda c: c.on_batch_end(self, progress_tracker, save_path, sync_step=should_step))

if batcher.last_batch():
# We have completed an epoch, so we need to increment the epoch counter. It's important to do this
# here rather than after the train loop, since the loop may exit early due to early stopping or
# step-based training.
progress_tracker.epoch += 1
self.callback(lambda c: c.on_epoch_end(self, progress_tracker, save_path))

if progress_tracker.steps % final_steps_per_checkpoint == 0:
if not self.skip_all_evaluation:
# Publishes metrics to MLFlow if there are any MLFlow callbacks.
should_break = self.run_evaluation(
training_set,
validation_set,
Expand Down
33 changes: 29 additions & 4 deletions tests/integration_tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -563,9 +563,7 @@ def test_api_callbacks_default_train_steps(tmpdir, csv_filename):


def test_api_callbacks_fixed_train_steps(tmpdir, csv_filename):
# If train_steps is set manually, epochs is ignored.
train_steps = 100
epochs = 2
batch_size = 8
num_examples = 80
mock_callback = mock.Mock(wraps=Callback())
Expand All @@ -576,7 +574,7 @@ def test_api_callbacks_fixed_train_steps(tmpdir, csv_filename):
"input_features": input_features,
"output_features": output_features,
"combiner": {"type": "concat", "output_size": 14},
TRAINER: {"epochs": epochs, "train_steps": train_steps, "batch_size": batch_size},
TRAINER: {"train_steps": train_steps, "batch_size": batch_size},
}
model = LudwigModel(config, callbacks=[mock_callback])
model.train(
Expand All @@ -589,6 +587,33 @@ def test_api_callbacks_fixed_train_steps(tmpdir, csv_filename):
assert mock_callback.on_epoch_start.call_count == 10


def test_api_callbacks_fixed_train_steps_partial_epochs(tmpdir, csv_filename):
    """When train_steps is set manually, epochs is ignored and on_epoch_end
    fires only for fully completed passes over the training data."""
    # 80 examples at batch size 8 -> 10 steps per epoch; 95 steps ends mid-epoch.
    train_steps = 95
    epochs = 2
    batch_size = 8
    num_examples = 80

    callback_spy = mock.Mock(wraps=Callback())

    in_feats = [sequence_feature(encoder={"reduce_output": "sum"})]
    out_feats = [category_feature(decoder={"vocab_size": 5}, reduce_input="sum")]
    model = LudwigModel(
        {
            "input_features": in_feats,
            "output_features": out_feats,
            "combiner": {"type": "concat", "output_size": 14},
            TRAINER: {"epochs": epochs, "train_steps": train_steps, "batch_size": batch_size},
        },
        callbacks=[callback_spy],
    )

    dataset_path = os.path.join(tmpdir, csv_filename)
    model.train(
        training_set=generate_data(in_feats, out_feats, dataset_path, num_examples=num_examples)
    )

    # 95 steps at 10 steps/epoch completes exactly 9 full epochs (the 10th is partial).
    assert callback_spy.on_epoch_end.call_count == 9


def test_api_callbacks_batch_size_1(tmpdir, csv_filename):
epochs = 2
batch_size = 1
Expand Down Expand Up @@ -645,7 +670,7 @@ def test_api_callbacks_fixed_train_steps_less_than_one_epoch(tmpdir, csv_filenam
)

assert mock_callback.on_epoch_start.call_count == 1
assert mock_callback.on_epoch_end.call_count == 1
assert mock_callback.on_epoch_end.call_count == 0
# The total number of batches is the number of train_steps
assert mock_callback.on_batch_end.call_count == total_batches
# The total number of evals is the number of times checkpoints are made
Expand Down