Lightning-AI · awaelchli · Mar 13, 2024 · Feb 23, 2024 · Feb 23, 2024 · Feb 23, 2024
@@ -54,6 +54,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed infinite recursion error in precision plugin graveyard ([#19542](https://github.com/Lightning-AI/pytorch-lightning/pull/19542))
 
 
+- Fixed a KeyError when saving an FSDP sharded checkpoint and setting `save_weights_only=True` ([#19524](https://github.com/Lightning-AI/pytorch-lightning/pull/19524))
+
 
 ## [2.2.0] - 2024-02-08
 

@@ -568,7 +568,8 @@ def save_checkpoint(
 
             converted_state = {"model": checkpoint.pop("state_dict")}
             converted_state.update({
-                f"optimizer_{idx}": optim_state for idx, optim_state in enumerate(checkpoint.pop("optimizer_states"))
+                f"optimizer_{idx}": optim_state
+                for idx, optim_state in enumerate(checkpoint.pop("optimizer_states", []))
             })
 
             _distributed_checkpoint_save(converted_state, path)

@@ -185,7 +185,8 @@ def _run_multiple_stages(trainer, model, model_path: Optional[str] = None):
     trainer.save_checkpoint(model_path.with_name("after-test"))
     trainer.save_checkpoint(model_path, weights_only=True)
 
-    _assert_save_equality(trainer, model_path, cls=model.__class__)
+    if not model_path.is_dir():  # TODO (@awaelchli): Add support for asserting equality of sharded checkpoints
+        _assert_save_equality(trainer, model_path, cls=model.__class__)
 
     with torch.inference_mode():
         # Test entry point
@@ -277,13 +278,14 @@ def training_step(self, batch, batch_idx):
     trainer.fit(model)
 
 
-@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True)
 @pytest.mark.parametrize("precision", ["16-mixed", pytest.param("bf16-mixed", marks=RunIf(bf16_cuda=True))])
-def test_fsdp_strategy_checkpoint(tmpdir, precision):
+@pytest.mark.parametrize("state_dict_type", ("sharded", "full"))
+def test_fsdp_strategy_checkpoint(state_dict_type, precision, tmpdir):
     """Test to ensure that the checkpoint is saved correctly when using two GPUs, and all stages can be run."""
     model = TestFSDPModel()
+    strategy = FSDPStrategy(state_dict_type=state_dict_type)
     trainer = Trainer(
-        default_root_dir=tmpdir, accelerator="gpu", devices=2, strategy="fsdp", precision=precision, max_epochs=1
+        default_root_dir=tmpdir, accelerator="gpu", devices=2, strategy=strategy, precision=precision, max_epochs=1
     )
     _run_multiple_stages(trainer, model, os.path.join(tmpdir, "last.ckpt"))
Original file line number	Diff line number	Diff line change
Expand Up		@@ -54,6 +54,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
		- Fixed infinite recursion error in precision plugin graveyard ([#19542](https://github.com/Lightning-AI/pytorch-lightning/pull/19542))


		- Fixed a KeyError when saving an FSDP sharded checkpoint and setting `save_weights_only=True` ([#19524](https://github.com/Lightning-AI/pytorch-lightning/pull/19524))


		## [2.2.0] - 2024-02-08

Expand Down