huggingface · muellerzr · Jul 1, 2024 · May 11, 2024 · Jun 13, 2024 · jmracek
diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
@@ -2952,6 +2952,7 @@ def _inner(folder):
             schedulers,
             dataloaders,
             self.state.process_index,
+            self.step,
             self.scaler,
             save_on_each_node=self.project_configuration.save_on_each_node,
             safe_serialization=safe_serialization,
@@ -3099,7 +3100,7 @@ def _inner(folder):
             else:
                 map_location = "cpu"
 
-        load_accelerator_state(
+        override_attributes = load_accelerator_state(
             input_dir,
             models,
             optimizers,
@@ -3110,6 +3111,7 @@ def _inner(folder):
             map_location,
             **load_model_func_kwargs,
         )
+        self.step = override_attributes.get("step", self.step)  # keep current step if the checkpoint has none
         custom_checkpoints = [
             f for f in os.listdir(input_dir) if re.search(r"^custom_checkpoint_\d+\.pkl$", f) is not None
         ]

diff --git a/src/accelerate/checkpointing.py b/src/accelerate/checkpointing.py
@@ -55,6 +55,7 @@ def save_accelerator_state(
     schedulers: list,
     dataloaders: list,
     process_index: int,
+    step: int,
     scaler: GradScaler = None,
     save_on_each_node: bool = False,
     safe_serialization: bool = True,
@@ -82,6 +83,8 @@ def save_accelerator_state(
             A list of dataloader instances to save their sampler states
         process_index (`int`):
             The current process index in the Accelerator state
+        step (`int`):
+            The current step in the internal step tracker
         scaler (`torch.cuda.amp.GradScaler`, *optional*):
             An optional gradient scaler instance to save
         save_on_each_node (`bool`, *optional*):
@@ -134,6 +137,7 @@ def save_accelerator_state(
     # Random number generator states
     states = {}
     states_name = f"{RNG_STATE_NAME}_{process_index}.pkl"
+    states["step"] = step
     states["random_state"] = random.getstate()
     states["numpy_random_seed"] = np.random.get_state()
     states["torch_manual_seed"] = torch.get_rng_state()
@@ -181,6 +185,8 @@ def load_accelerator_state(
         load_model_func_kwargs (`dict`, *optional*):
             Additional arguments that can be passed to the model's `load_state_dict` method.
     """
+    # stores the `Accelerator` attributes to override
+    override_attributes = dict()
     if map_location not in [None, "cpu", "on_device"]:
         raise TypeError(
             "Unsupported optimizer map location passed, please choose one of `None`, `'cpu'`, or `'on_device'`"
@@ -240,6 +246,9 @@ def load_accelerator_state(
     # Random states
     try:
         states = torch.load(input_dir.joinpath(f"{RNG_STATE_NAME}_{process_index}.pkl"))
+        # Older checkpoints have no "step" entry; a KeyError here would skip the RNG restore below
+        if "step" in states:
+            override_attributes["step"] = states["step"]
         random.setstate(states["random_state"])
         np.random.set_state(states["numpy_random_seed"])
         torch.set_rng_state(states["torch_manual_seed"])
@@ -253,6 +262,8 @@ def load_accelerator_state(
     except Exception:
         logger.info("Could not load random states")
 
+    return override_attributes
+
 
 def save_custom_state(obj, path, index: int = 0, save_on_each_node: bool = False):
     """