[core] Fix DeepSpeed zero-3 issue #182

Merged · 8 commits · Mar 28, 2023
13 changes: 13 additions & 0 deletions docs/source/customization.mdx
@@ -149,4 +149,17 @@ When training large models, you should better handle the CUDA cache by iterative

```python
config = PPOConfig(..., optimize_cuda_cache=True)
```

## Use DeepSpeed stage 3 correctly

A small tweak needs to be added to your training script to use DeepSpeed stage 3 correctly: the reward model has to be initialized on the correct device, which is done by temporarily disabling ZeRO-3 weight partitioning through the `zero3_init_context_manager(enable=False)` context manager. Here is an example adapted from the `gpt2-sentiment` script:

```python
from transformers import pipeline

# `device` is the device assigned to the current process in the script
# (e.g. derived from `ppo_trainer.accelerator.device`).
ds_plugin = ppo_trainer.accelerator.state.deepspeed_plugin
if ds_plugin is not None and ds_plugin.is_zero3_init_enabled():
    # Disable ZeRO-3 weight partitioning while instantiating the reward model,
    # so that its weights are fully materialized on this device.
    with ds_plugin.zero3_init_context_manager(enable=False):
        sentiment_pipe = pipeline("sentiment-analysis", model="lvwerra/distilbert-imdb", device=device)
else:
    sentiment_pipe = pipeline("sentiment-analysis", model="lvwerra/distilbert-imdb", device=device)
```
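
If the same guard is needed in more than one place, it can be factored into a small helper. The function below is a minimal sketch for illustration only and is not part of TRL; `build_reward_pipeline` is a hypothetical name, and it assumes the same `accelerator` object the trainer uses:

```python
from contextlib import nullcontext

from transformers import pipeline


def build_reward_pipeline(accelerator, model_name, task="sentiment-analysis"):
    # Hypothetical helper: create a pipeline with ZeRO-3 init disabled, so the
    # reward model's weights are materialized on this process's device.
    ds_plugin = accelerator.state.deepspeed_plugin
    ctx = (
        ds_plugin.zero3_init_context_manager(enable=False)
        if ds_plugin is not None and ds_plugin.is_zero3_init_enabled()
        else nullcontext()  # no-op when DeepSpeed ZeRO-3 init is not active
    )
    with ctx:
        return pipeline(task, model=model_name, device=accelerator.device)
```

With such a helper, both branches of the example above collapse into a single call, e.g. `sentiment_pipe = build_reward_pipeline(ppo_trainer.accelerator, "lvwerra/distilbert-imdb")`.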
21 changes: 19 additions & 2 deletions trl/trainer/ppo_trainer.py
@@ -265,16 +265,33 @@ def __init__(
else:
self.kl_ctl = FixedKLController(self.config.init_kl_coef)

# Safety checkers for DS integration
is_deepspeed_used = self.accelerator.distributed_type == "DEEPSPEED" and hasattr(
self.accelerator.state, "deepspeed_plugin"
)

(
self.model,
self.ref_model,
self.optimizer,
self.data_collator,
self.dataloader,
self.lr_scheduler,
) = self.accelerator.prepare(
self.model, self.ref_model, self.optimizer, self.data_collator, self.dataloader, self.lr_scheduler
self.model, self.optimizer, self.data_collator, self.dataloader, self.lr_scheduler
)
if is_deepspeed_used:
# 8 bit models are already set on the correct device
if not getattr(self.ref_model.pretrained_model, "is_loaded_in_8bit", False):
# DS integration only allows for a single model, and since `ref_model` is only used for the
# KL divergence loss, i.e. in eval mode, it can simply be placed on the respective device;
# there is no need to pass it to the `accelerator.prepare` call
self.ref_model = self.ref_model.to(self.accelerator.device)

# this hack seems to be needed for DS stage 3 to work
if self.accelerator.state.deepspeed_plugin.zero_stage == 3:
self.model.train()
Comment on lines +291 to +292
@younesbelkada (Contributor Author) commented on Mar 27, 2023:

Based on the offline discussion I had with @pacman100, I confirm this hack is needed to make DS3 work
else:
self.ref_model = self.accelerator.prepare(self.ref_model)

# In a distributed setup, only logging needs to be performed on the main process
# check: https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html
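
For reference, the `ref_model` handling that this diff adds to `PPOTrainer.__init__` boils down to the following. This is a condensed, illustrative sketch rather than the exact trainer code; `model`, `ref_model`, and `accelerator` stand in for the corresponding `self.*` attributes:

```python
# Condensed sketch of the DeepSpeed handling added to `PPOTrainer.__init__`;
# `model`, `ref_model`, and `accelerator` stand in for the trainer's `self.*` attributes.
is_deepspeed_used = accelerator.distributed_type == "DEEPSPEED" and hasattr(
    accelerator.state, "deepspeed_plugin"
)

if is_deepspeed_used:
    # Only the trained model goes through `accelerator.prepare`; the frozen
    # reference model (used solely for the KL penalty, in eval mode) is just
    # moved to the local device, unless it is an 8-bit model that is already
    # placed correctly at load time.
    if not getattr(ref_model.pretrained_model, "is_loaded_in_8bit", False):
        ref_model = ref_model.to(accelerator.device)

    # ZeRO stage 3 additionally needs the trained model switched to train mode.
    if accelerator.state.deepspeed_plugin.zero_stage == 3:
        model.train()
else:
    # Without DeepSpeed, the reference model is prepared like everything else.
    ref_model = accelerator.prepare(ref_model)
```

The key point is that under DeepSpeed only the trained model is passed to `accelerator.prepare`, while the frozen reference model is simply moved to the local device.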