[Llama] Support pp + no_recompute_layer. (PaddlePaddle#9373)
Co-authored-by: 周天宇 <tianyu.zhou@iluvatar.com>
tianyuzhou668 and 周天宇 authored Nov 6, 2024
1 parent 3971fc7 commit 2f0b407
Showing 1 changed file with 6 additions and 3 deletions.
paddlenlp/transformers/llama/modeling_pp.py (6 additions & 3 deletions)
@@ -242,7 +242,12 @@ def forward(self, args):
             attn_mask_startend_row_indices = None
 
         has_gradient = not hidden_states.stop_gradient
-        if self.enable_recompute and self.config.recompute_granularity == "full" and has_gradient:
+        if (
+            self.enable_recompute
+            and self.layerwise_recompute
+            and self.config.recompute_granularity == "full"
+            and has_gradient
+        ):
             if attention_mask is not None or alibi is not None or attn_mask_startend_row_indices is not None:
                 hidden_states = recompute(
                     super().forward,

@@ -340,8 +345,6 @@ def __init__(self, config):
         self.recompute_granularity = self.config.recompute_granularity
         self.pp_recompute_interval = self.config.pp_recompute_interval
         self.no_recompute_layers = config.no_recompute_layers if config.no_recompute_layers is not None else []
-        if self.recompute_granularity == "full":
-            assert len(self.no_recompute_layers) == 0, "for pp with full recompute, no_recompute_layers is not support"
 
         virtual_pp_degree = getattr(self.config, "virtual_pp_degree", 1)
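For context, the condition added in the first hunk gates full recompute on a per-layer flag, and the assertion removed in the second hunk is what previously rejected a non-empty no_recompute_layers under pipeline parallelism with full recompute. A minimal, self-contained sketch of the resulting per-layer decision (illustrative only; the helper function and the small config stand-in below are assumptions, not PaddleNLP code):

from dataclasses import dataclass, field
from typing import List


@dataclass
class RecomputeConfig:
    # Stand-in for the LlamaConfig fields referenced in the diff.
    recompute_granularity: str = "full"
    no_recompute_layers: List[int] = field(default_factory=list)


def should_recompute_layer(layer_idx: int, config: RecomputeConfig,
                           enable_recompute: bool, has_gradient: bool) -> bool:
    # layerwise_recompute mirrors the per-layer flag in the new condition:
    # a layer listed in no_recompute_layers opts out of full recompute.
    layerwise_recompute = layer_idx not in config.no_recompute_layers
    return (
        enable_recompute
        and layerwise_recompute
        and config.recompute_granularity == "full"
        and has_gradient
    )


cfg = RecomputeConfig(no_recompute_layers=[0, 1])
print(should_recompute_layer(0, cfg, enable_recompute=True, has_gradient=True))  # False: layer 0 is exempt
print(should_recompute_layer(2, cfg, enable_recompute=True, has_gradient=True))  # True: layer 2 is recomputed

With the assertion gone, each pipeline decoder layer consults its own layerwise_recompute flag instead of failing at construction time when no_recompute_layers is non-empty.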
