From aeb34f2fe62eb5711e325c648f149b2c622a5593 Mon Sep 17 00:00:00 2001
From: Travis Addair
Date: Wed, 25 Jan 2023 16:28:16 -0800
Subject: [PATCH 1/3] Cap batch_size=auto at 128 for CPU training

---
 ludwig/schema/trainer.py   | 4 +++-
 ludwig/trainers/trainer.py | 9 ++++++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/ludwig/schema/trainer.py b/ludwig/schema/trainer.py
index 695cfc23dd9..98c47bd4f2b 100644
--- a/ludwig/schema/trainer.py
+++ b/ludwig/schema/trainer.py
@@ -101,7 +101,9 @@ class ECDTrainerConfig(BaseTrainerConfig):
         allow_none=False,
         description=(
             "The number of training examples utilized in one training step of the model. If ’auto’, the "
-            "biggest batch size (power of 2) that can fit in memory will be used."
+            "batch size that maximized training throughput (samples / sec) will be used. For CPU training, the "
+            "tuned batch size is capped at 128 as throughput benefits of large batch sizes are less noticeable without "
+            "a GPU."
         ),
         parameter_metadata=TRAINER_METADATA["batch_size"],
         field_options=[
diff --git a/ludwig/trainers/trainer.py b/ludwig/trainers/trainer.py
index 9cfc97623b0..e717d0b0685 100644
--- a/ludwig/trainers/trainer.py
+++ b/ludwig/trainers/trainer.py
@@ -69,6 +69,8 @@
     ProgressTracker,
 )

+MAX_CPU_BATCH_SIZE = 128
+
 logger = logging.getLogger(__name__)


@@ -372,11 +374,16 @@ def tune_batch_size(
         self.skip_save_progress = True
         self.skip_save_log = True

+        # When training on CPU, larger batch sizes offer limited benefits due to lack of effective
+        # parallelization within a batch. As such, to increase changes of stable training, we cap the maximum
+        # batch size at MAX_CPU_BATCH_SIZE
+        max_batch_size = self.max_batch_size if torch.cuda.is_available() else MAX_CPU_BATCH_SIZE
+
         self.dist_model.train()  # Sets model training mode.
         evaluator = self._create_batch_size_evaluator()
         try:
-            return evaluator.select_best_batch_size(len(training_set), self.max_batch_size, max_trials)
+            return evaluator.select_best_batch_size(len(training_set), max_batch_size, max_trials)
         finally:
             # Restore original parameters to defaults
             self.skip_save_model = skip_save_model

From 13aeece594630ff7eac0725549b68449e6d8a157 Mon Sep 17 00:00:00 2001
From: Travis Addair
Date: Wed, 25 Jan 2023 20:07:04 -0800
Subject: [PATCH 2/3] Account for max_batch_size

---
 ludwig/trainers/trainer.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/ludwig/trainers/trainer.py b/ludwig/trainers/trainer.py
index e717d0b0685..dd16a98de34 100644
--- a/ludwig/trainers/trainer.py
+++ b/ludwig/trainers/trainer.py
@@ -377,7 +377,9 @@ def tune_batch_size(
         # When training on CPU, larger batch sizes offer limited benefits due to lack of effective
         # parallelization within a batch. As such, to increase changes of stable training, we cap the maximum
         # batch size at MAX_CPU_BATCH_SIZE
-        max_batch_size = self.max_batch_size if torch.cuda.is_available() else MAX_CPU_BATCH_SIZE
+        max_batch_size = (
+            self.max_batch_size if torch.cuda.is_available() else min(self.max_batch_size, MAX_CPU_BATCH_SIZE)
+        )

         self.dist_model.train()  # Sets model training mode.
From 05105b8a1eee0654b34783817d86f3ff6350b158 Mon Sep 17 00:00:00 2001
From: Travis Addair
Date: Thu, 26 Jan 2023 09:39:21 -0800
Subject: [PATCH 3/3] Update ludwig/trainers/trainer.py

Co-authored-by: Joppe Geluykens
---
 ludwig/trainers/trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ludwig/trainers/trainer.py b/ludwig/trainers/trainer.py
index dd16a98de34..c12cea34561 100644
--- a/ludwig/trainers/trainer.py
+++ b/ludwig/trainers/trainer.py
@@ -375,7 +375,7 @@ def tune_batch_size(
         self.skip_save_log = True

         # When training on CPU, larger batch sizes offer limited benefits due to lack of effective
-        # parallelization within a batch. As such, to increase changes of stable training, we cap the maximum
+        # parallelization within a batch. As such, to increase chances of stable training, we cap the maximum
         # batch size at MAX_CPU_BATCH_SIZE
         max_batch_size = (
             self.max_batch_size if torch.cuda.is_available() else min(self.max_batch_size, MAX_CPU_BATCH_SIZE)
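Taken together, these three patches cap the upper bound used by `batch_size=auto` tuning at 128 when no GPU is available, while still respecting a smaller user-configured `trainer.max_batch_size`. The standalone sketch below illustrates that selection logic outside the `Trainer` class; the function name `effective_max_batch_size` and the example values are illustrative and not part of the patches, while `MAX_CPU_BATCH_SIZE` and the `min(...)` expression mirror the diff:

```python
import torch

# Cap applied to batch_size="auto" tuning when no GPU is available (PATCH 1/3).
MAX_CPU_BATCH_SIZE = 128


def effective_max_batch_size(configured_max_batch_size: int) -> int:
    """Upper bound handed to the batch size evaluator by tune_batch_size().

    On GPU, the configured trainer.max_batch_size is used unchanged. On CPU,
    it is additionally capped at MAX_CPU_BATCH_SIZE (PATCH 2/3), since large
    batches offer little throughput benefit without GPU parallelism.
    """
    if torch.cuda.is_available():
        return configured_max_batch_size
    return min(configured_max_batch_size, MAX_CPU_BATCH_SIZE)


# Illustrative values: on a CPU-only machine, even a very large configured
# max_batch_size is reduced to 128 before batch size tuning begins.
print(effective_max_batch_size(1024))  # -> 128 without CUDA, 1024 with CUDA
```

The `min(...)` introduced in PATCH 2/3 matters for the CPU branch: a `max_batch_size` configured below 128 is still honored rather than being overridden by the cap.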