Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Increment epochs based on last_batch() instead of at the end of the train loop. #3668

Merged
merged 3 commits into from
Sep 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions ludwig/trainers/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -912,7 +912,7 @@ def train(

self.callback(lambda c: c.on_epoch_start(self, progress_tracker, save_path))

# Trains over a full epoch of data.
# Trains over a full epoch of data or up to the last training step, whichever is sooner.
should_break = self._train_loop(
batcher,
progress_tracker,
Expand All @@ -934,10 +934,6 @@ def train(
profiler,
)

# ================ Post Training Epoch ================
progress_tracker.epoch += 1
self.callback(lambda c: c.on_epoch_end(self, progress_tracker, save_path))

if self.is_coordinator():
# ========== Save training progress ==========
logger.debug(
Expand Down Expand Up @@ -1114,8 +1110,16 @@ def _train_loop(
# batch duration measurements when using timer callbacks.
self.callback(lambda c: c.on_batch_end(self, progress_tracker, save_path, sync_step=should_step))

if batcher.last_batch():
# We have completed an epoch, so we need to increment the epoch counter. It's important to do this
# here rather than after the train loop, since the loop may exit early due to early stopping or
# step-based training.
progress_tracker.epoch += 1
self.callback(lambda c: c.on_epoch_end(self, progress_tracker, save_path))

if progress_tracker.steps % final_steps_per_checkpoint == 0:
if not self.skip_all_evaluation:
# Publishes metrics to MLFlow if there are any MLFlow callbacks.
should_break = self.run_evaluation(
training_set,
validation_set,
Expand Down
33 changes: 29 additions & 4 deletions tests/integration_tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -563,9 +563,7 @@ def test_api_callbacks_default_train_steps(tmpdir, csv_filename):


def test_api_callbacks_fixed_train_steps(tmpdir, csv_filename):
# If train_steps is set manually, epochs is ignored.
train_steps = 100
epochs = 2
batch_size = 8
num_examples = 80
mock_callback = mock.Mock(wraps=Callback())
Expand All @@ -576,7 +574,7 @@ def test_api_callbacks_fixed_train_steps(tmpdir, csv_filename):
"input_features": input_features,
"output_features": output_features,
"combiner": {"type": "concat", "output_size": 14},
TRAINER: {"epochs": epochs, "train_steps": train_steps, "batch_size": batch_size},
TRAINER: {"train_steps": train_steps, "batch_size": batch_size},
}
model = LudwigModel(config, callbacks=[mock_callback])
model.train(
Expand All @@ -589,6 +587,33 @@ def test_api_callbacks_fixed_train_steps(tmpdir, csv_filename):
assert mock_callback.on_epoch_start.call_count == 10


def test_api_callbacks_fixed_train_steps_partial_epochs(tmpdir, csv_filename):
    """When train_steps is set manually, epochs is ignored and on_epoch_end
    fires only for fully completed passes over the training data."""
    # 80 examples at batch size 8 -> 10 steps per epoch; 95 steps ends mid-epoch.
    train_steps = 95
    epochs = 2
    batch_size = 8
    num_examples = 80

    callback_spy = mock.Mock(wraps=Callback())

    in_feats = [sequence_feature(encoder={"reduce_output": "sum"})]
    out_feats = [category_feature(decoder={"vocab_size": 5}, reduce_input="sum")]
    model = LudwigModel(
        {
            "input_features": in_feats,
            "output_features": out_feats,
            "combiner": {"type": "concat", "output_size": 14},
            TRAINER: {"epochs": epochs, "train_steps": train_steps, "batch_size": batch_size},
        },
        callbacks=[callback_spy],
    )

    dataset_path = os.path.join(tmpdir, csv_filename)
    model.train(
        training_set=generate_data(in_feats, out_feats, dataset_path, num_examples=num_examples)
    )

    # 95 steps at 10 steps/epoch completes exactly 9 full epochs (the 10th is partial).
    assert callback_spy.on_epoch_end.call_count == 9


def test_api_callbacks_batch_size_1(tmpdir, csv_filename):
epochs = 2
batch_size = 1
Expand Down Expand Up @@ -645,7 +670,7 @@ def test_api_callbacks_fixed_train_steps_less_than_one_epoch(tmpdir, csv_filenam
)

assert mock_callback.on_epoch_start.call_count == 1
assert mock_callback.on_epoch_end.call_count == 1
assert mock_callback.on_epoch_end.call_count == 0
# The total number of batches is the number of train_steps
assert mock_callback.on_batch_end.call_count == total_batches
# The total number of evals is the number of times checkpoints are made
Expand Down