ludwig-ai · arnavgarg1 · Nov 29, 2022 · Nov 29, 2022 · Nov 29, 2022 · Nov 29, 2022
@@ -50,6 +50,7 @@
 from ludwig.trainers.registry import register_trainer
 from ludwig.utils import time_utils
 from ludwig.utils.checkpoint_utils import Checkpoint, CheckpointManager
+from ludwig.utils.data_utils import load_json
 from ludwig.utils.defaults import default_random_seed
 from ludwig.utils.horovod_utils import return_first
 from ludwig.utils.math_utils import exponential_decay, learning_rate_warmup, learning_rate_warmup_distributed
@@ -1237,9 +1238,16 @@ def set_steps_to_1_or_quit(self, signum, frame):
             sys.exit(1)
 
     def resume_training_progress_tracker(self, training_progress_tracker_path):
+        progress_tracker_dict = None  # assigned on the coordinator only; see the broadcast below
         if self.is_coordinator():
-            logger.info(f"Resuming training of model: {training_progress_tracker_path}")
-        progress_tracker = ProgressTracker.load(training_progress_tracker_path)
+            logger.info(f"Loading progress tracker for model: {training_progress_tracker_path}")
+            progress_tracker_dict = load_json(training_progress_tracker_path)  # coordinator-only disk read
+        if self.horovod:  # broadcast so all workers share one dict (root assumed to be the coordinator)
+            logger.debug("Broadcasting model progress tracker dict to all workers")
+            progress_tracker_dict = self.horovod.broadcast_object(
+                progress_tracker_dict, name="broadcast_progress_tracker"
+            )
+        progress_tracker = ProgressTracker.load(progress_tracker_dict)  # takes the dict, not a file path
         return progress_tracker
 
     def resume_weights_and_optimizer(

@@ -12,7 +12,7 @@
 from ludwig.features.base_feature import OutputFeature
 from ludwig.models.base import BaseModel
 from ludwig.modules.metric_modules import get_best_function
-from ludwig.utils.data_utils import load_json, save_json
+from ludwig.utils.data_utils import save_json
 from ludwig.utils.metric_utils import TrainerMetric
 
 logger = logging.getLogger(__name__)
@@ -127,13 +127,10 @@ def save(self, filepath):
         save_json(filepath, self.__dict__)
 
     @staticmethod
-    def load(filepath):
-        loaded = load_json(filepath)
-
+    def load(progress_tracking_dict: Dict):  # takes the already-parsed dict; reading the file is the caller's job
         from ludwig.utils.backward_compatibility import upgrade_model_progress
 
-        loaded = upgrade_model_progress(loaded)
-
+        loaded = upgrade_model_progress(progress_tracking_dict)  # NOTE(review): presumably migrates older formats
         return ProgressTracker(**loaded)
 
     def log_metrics(self):

@@ -658,6 +658,30 @@ def test_experiment_model_resume(tmpdir):
     shutil.rmtree(output_dir, ignore_errors=True)
 
 
+@pytest.mark.distributed
+def test_experiment_model_resume_distributed(tmpdir, ray_cluster_4cpu):
+    # One number input, one category output, trained on the Ray backend with 2 workers.
+    # Runs an experiment, resumes it from its output directory, then predicts with the saved model.
+    input_features = [number_feature()]
+    output_features = [category_feature(output_feature=True)]
+    # Generate the CSV dataset for these features inside tmpdir
+    rel_path = generate_data(input_features, output_features, os.path.join(tmpdir, "dataset.csv"))
+
+    config = {
+        "input_features": input_features,
+        "output_features": output_features,
+        "combiner": {"type": "concat", "output_size": 8},
+        TRAINER: {"epochs": 1},
+        "backend": {"type": "ray", "trainer": {"num_workers": 2}},
+    }
+
+    _, _, _, _, output_dir = experiment_cli(config, dataset=rel_path, output_directory=tmpdir)
+
+    experiment_cli(config, dataset=rel_path, model_resume_path=output_dir)  # resume from the first run's output
+
+    predict_cli(os.path.join(output_dir, "model"), dataset=rel_path)
+
+
 def test_experiment_various_feature_types(csv_filename):
     input_features = [binary_feature(), bag_feature()]
     output_features = [set_feature(decoder={"max_len": 3, "vocab_size": 5})]