Support Early Exit Loss and/or Layer Dropout #1076

Merged: 90 commits, Dec 6, 2024
Changes shown in the diff below are from 4 of the 90 commits (an early snapshot of this PR).
Commits (90)
97cb9a8
start of layer dropout implementation
mostafaelhoushi Jun 9, 2024
4a25c5b
have different dropouts at different layers
mostafaelhoushi Jun 9, 2024
ac8ad0b
add option to specify which layers to apply dropout
mostafaelhoushi Jun 9, 2024
ae61c85
start early exit loss
mostafaelhoushi Jun 10, 2024
735d2a8
parallelize processing of early exit losses
mostafaelhoushi Jun 19, 2024
be912a6
use absolute imports
mostafaelhoushi Jun 19, 2024
0686dd2
remove unnecessary sync
mostafaelhoushi Jun 19, 2024
4e4783f
move early exit loss to separate file and add layers as arg
mostafaelhoushi Jun 19, 2024
268813e
perform loss scaling every iteration
mostafaelhoushi Jun 19, 2024
ccb4a50
return hidden states as an output rather than storing
mostafaelhoushi Jun 19, 2024
ff7d157
ensure last layer is always included
mostafaelhoushi Jun 20, 2024
5a23811
return either last logits or hidden states
mostafaelhoushi Jun 20, 2024
e11aeba
fix scaling layers
mostafaelhoushi Jun 20, 2024
f9e164f
rotational early exit curriculum
mostafaelhoushi Jun 20, 2024
069b661
set early exit params from cli
mostafaelhoushi Jul 22, 2024
954d097
ensure last layer loss is always calculated
mostafaelhoushi Jul 22, 2024
5789745
implement gradual early exit
mostafaelhoushi Jul 22, 2024
c3534e6
get streaming to work
mostafaelhoushi Jul 22, 2024
d1c6963
Merge branch 'main' into layerskip
mostafaelhoushi Nov 12, 2024
7849130
add separate recipe for early exit
mostafaelhoushi Nov 13, 2024
df89c4f
port early exit loss code from PR
mostafaelhoushi Nov 13, 2024
6cedb19
convert boolean array to indices
mostafaelhoushi Nov 17, 2024
a83da5a
decide on hidden outputs by member variable not forward pass
mostafaelhoushi Nov 17, 2024
2a8791d
add early exit recipe config
mostafaelhoushi Nov 17, 2024
a326937
refactor unembedding
mostafaelhoushi Nov 17, 2024
8ba6ab4
got early exit loss to work
mostafaelhoushi Nov 18, 2024
681e7ca
add TopV2 instruction set
mostafaelhoushi Nov 19, 2024
119ac7d
ensure all early exit loss params from cfg file are passed to code
mostafaelhoushi Nov 19, 2024
3ec9d23
fix gradual early exit
mostafaelhoushi Nov 19, 2024
04a590f
add test cases for early exit loss
mostafaelhoushi Nov 19, 2024
9b5c96a
add more assertions for rotational early exit
mostafaelhoushi Nov 19, 2024
3319ab0
test to follow training code
mostafaelhoushi Nov 19, 2024
619b3eb
fix curriculum update
mostafaelhoushi Nov 20, 2024
d376ddd
update recipe
mostafaelhoushi Nov 21, 2024
ff3977b
reset changes to data loading
mostafaelhoushi Nov 21, 2024
75b2e01
code cleanup
mostafaelhoushi Nov 23, 2024
33a95f5
rename early_exit to early_exit_loss
mostafaelhoushi Nov 23, 2024
5d7e903
address some early exit TODOs
mostafaelhoushi Nov 23, 2024
87f2ee0
get layer dropout to work
mostafaelhoushi Nov 23, 2024
1de0c2a
clean up early exit curriculum
mostafaelhoushi Nov 24, 2024
2b0cdd1
enable grad curriculum for subset of layers + clear hidden_states at …
mostafaelhoushi Nov 24, 2024
7973459
add docstring for slice_str_to_array
mostafaelhoushi Nov 24, 2024
baed8a9
support commas and add assertion statements
mostafaelhoushi Nov 24, 2024
27f6b56
add test cases for slice_to_str_array
mostafaelhoushi Nov 24, 2024
63e7c5b
add copyright header
mostafaelhoushi Nov 24, 2024
638056b
support single index
mostafaelhoushi Nov 24, 2024
a20b07c
add new line at end of file
mostafaelhoushi Nov 24, 2024
64210e6
Merge branch 'main' into layerskip
mostafaelhoushi Nov 24, 2024
98897a8
add layer dropout test cases
mostafaelhoushi Nov 24, 2024
2cc94cc
rename apply_layer_dropout to prepare_layer_dropout
mostafaelhoushi Nov 24, 2024
f4f8e02
add test cases for get_scale
mostafaelhoushi Nov 24, 2024
fed955e
cleanup get_scale + re-write mathematically equivalent + ensure max s…
mostafaelhoushi Nov 24, 2024
ca7d8da
test layer_dropout
mostafaelhoushi Nov 24, 2024
0146764
start adding early exit loss and layer dropout to docstring
mostafaelhoushi Nov 24, 2024
f599eca
fix and update code and test cases to handle updating last layer sepa…
mostafaelhoushi Nov 24, 2024
2437092
change match to if-else for CI
mostafaelhoushi Nov 24, 2024
ad090af
add assertion on type of loss fn for early exit loss
mostafaelhoushi Nov 25, 2024
cec8cd4
add docstring and slightly change attribute of layer_dropout and earl…
mostafaelhoushi Nov 25, 2024
b69f2f3
refactor layer_dropout and add test cases on wrapper
mostafaelhoushi Nov 25, 2024
a21cbd3
add TODO comment
mostafaelhoushi Nov 25, 2024
eb37cb6
fix error in checking if early exit loss is enabled
mostafaelhoushi Nov 25, 2024
2e3f502
change recipe defaults of dataset and layer_drop probability
mostafaelhoushi Nov 26, 2024
66a41b2
add detailed docstring to training script
mostafaelhoushi Nov 26, 2024
345a0a3
ensure we set last layer early exit enable correctly
mostafaelhoushi Nov 26, 2024
20c618c
ensure uniform early exit loss works
mostafaelhoushi Nov 26, 2024
f0e8d7f
add documentation to .yaml file and update doc in .py
mostafaelhoushi Nov 26, 2024
b03cb57
remove commented lines
mostafaelhoushi Nov 27, 2024
199b8dd
remove check on PyTorch version since we assume latest stable PyTorch
mostafaelhoushi Nov 27, 2024
6a2d79b
load curriculum step when resuming
mostafaelhoushi Nov 27, 2024
e5534ea
repeat arguments in derived classes
mostafaelhoushi Nov 27, 2024
d270d1f
rename percent_scale to fraction_scale and change its implementation
mostafaelhoushi Nov 27, 2024
e51419c
fixes to docstrings and config examples
mostafaelhoushi Dec 1, 2024
40b7987
check if cfg_early_exit_loss has curriculum
mostafaelhoushi Dec 1, 2024
0c18595
add comment to explain when has no effect
mostafaelhoushi Dec 1, 2024
3e68696
organize early exit loss tests into classes
mostafaelhoushi Dec 1, 2024
418951b
fix typo
mostafaelhoushi Dec 1, 2024
e5a53f9
test all loss scale types
mostafaelhoushi Dec 1, 2024
3567a24
use variable number of subset layers
mostafaelhoushi Dec 1, 2024
ae2108d
ensure get_scale returns values between 0 and 1
mostafaelhoushi Dec 1, 2024
71707de
add test cases for sigmoid
mostafaelhoushi Dec 2, 2024
78aff5a
make prepare_layer_dropout apply on a list of layers rather than a model
mostafaelhoushi Dec 2, 2024
0fb373b
Only add `optional` in docstring when argument is optional
mostafaelhoushi Dec 4, 2024
b66e23b
add Dropout class and prepare_layer_dropout APIs to docs
mostafaelhoushi Dec 4, 2024
cd8be64
add empty line between function description and Args
mostafaelhoushi Dec 4, 2024
2675b4c
remove assert statement as we added the check in testing
mostafaelhoushi Dec 4, 2024
00d8efa
change loss scale from enum to function
mostafaelhoushi Dec 5, 2024
78b8996
change curriculum from enum to function
mostafaelhoushi Dec 5, 2024
ed33ba9
rename scale_type to scale_fn
mostafaelhoushi Dec 6, 2024
c7f02de
change default
mostafaelhoushi Dec 6, 2024
69f840c
update docstring
mostafaelhoushi Dec 6, 2024
16 changes: 15 additions & 1 deletion recipes/full_finetune_distributed.py
@@ -516,14 +516,28 @@ def train(self) -> None:
input_pos.to(self._device) if input_pos is not None else None
)

logits = self._model(tokens, mask=mask, input_pos=input_pos)
logits = self._model(tokens, mask=mask, input_pos=input_pos, output_hidden_states=True)
# Shift so that tokens < n predict n
logits = logits[..., :-1, :].contiguous()
labels = labels[..., 1:].contiguous()
logits = logits.transpose(1, 2)
# Compute loss
loss = self._loss_fn(logits, labels)

# Compute early exit loss
if self._model.output_hidden_states:
# TODO: calculate early_logits in one shot:
# logits_early = self._model.output(self._model.norm(torch.stack(tuple(self._model.output_hidden_states.values()))))
for layer_id, hidden_state in self._model.output_hidden_states.items():
h_early = self._model.norm(hidden_state)
logits_early = self._model.output(h_early)
# Shift so that tokens < n predict n
logits_early = logits_early[..., :-1, :].contiguous()
logits_early = logits_early.transpose(1, 2)
# Compute early loss
loss_early = self._loss_fn(logits_early, labels)
loss += 0.1 / len(self._model.layers) * loss_early

loss = loss / self._gradient_accumulation_steps
running_loss += loss
loss.backward()
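
The loop above unembeds each requested hidden state separately. A minimal sketch (not part of this diff) of the batched alternative the TODO hints at, assuming every cached hidden state shares the same [b, s, d] shape, that self._model.norm and self._model.output broadcast over an extra leading dimension, and with labels already shifted as above:

hidden = torch.stack(tuple(self._model.output_hidden_states.values()))   # [e, b, s, d]
logits_early = self._model.output(self._model.norm(hidden))              # [e, b, s, v]
logits_early = logits_early[..., :-1, :].contiguous().transpose(-1, -2)  # [e, b, v, s-1]
loss_early = sum(self._loss_fn(le, labels) for le in logits_early)       # same contract as the call above
loss = loss + 0.1 / len(self._model.layers) * loss_early
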
3 changes: 3 additions & 0 deletions torchtune/modules/__init__.py
@@ -5,6 +5,7 @@
# LICENSE file in the root directory of this source tree.

from .attention import CausalSelfAttention # noqa
from .layer_dropout import LayerDropout, create_layer_dropout_modules # noqa
from .common_utils import reparametrize_as_dtype_state_dict_post_hook
from .feed_forward import FeedForward # noqa
from .kv_cache import KVCache # noqa
@@ -24,4 +25,6 @@
"TransformerDecoderLayer",
"TransformerClassifier",
"reparametrize_as_dtype_state_dict_post_hook",
"LayerDropout",
"create_layer_dropout_modules",
]
27 changes: 27 additions & 0 deletions torchtune/modules/common_utils.py
@@ -48,3 +48,30 @@ def reparametrize_as_dtype_state_dict_post_hook(
state_dict[k] = v.to(dtype)
if offload_to_cpu:
state_dict[k] = state_dict[k].cpu()

def slice_str_to_array(slice_str, length):
    """Convert a Python-style slice string (e.g. "1:4", "::2", ":") into a boolean mask of size ``length``."""
    # Parse the slice string
parts = slice_str.split(':')
start, end, step = None, None, None

if len(parts) == 1 and parts[0] != '':
start = int(parts[0])
elif len(parts) == 2:
start = int(parts[0]) if parts[0] != '' else None
end = int(parts[1]) if parts[1] != '' else None
elif len(parts) == 3:
start = int(parts[0]) if parts[0] != '' else None
end = int(parts[1]) if parts[1] != '' else None
step = int(parts[2]) if parts[2] != '' else None

# Create a boolean array based on the slice
result = [False] * length
slice_indices = range(start if start is not None else 0,
end if end is not None else length,
step if step is not None else 1)

for i in slice_indices:
if 0 <= i < length:
result[i] = True

return result
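
A few illustrative calls (not part of the diff) showing the mask this helper produces in its current form; note that a lone index selects from that index to the end here (the later "support single index" commit appears to revisit this):

slice_str_to_array("1:4", 6)   # [False, True, True, True, False, False]
slice_str_to_array("::2", 5)   # [True, False, True, False, True]
slice_str_to_array("3", 5)     # [False, False, False, True, True]
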
85 changes: 85 additions & 0 deletions torchtune/modules/layer_dropout.py
@@ -0,0 +1,85 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from enum import Enum
from typing import Callable, Optional
import math
import torch

from .common_utils import slice_str_to_array

class LayerDropout(torch.nn.Module):
def __init__(self, prob=0.0, dim=0, disable_on_eval=True, seed=None):
super().__init__()
self.prob: float = prob
self.dim = dim
self.disable_on_eval: bool = disable_on_eval
self.generator = torch.Generator(device="cpu")
self.inferred: float = None

if seed is not None:
self.generator.manual_seed(seed)

def forward(self, function: Callable, input: torch.Tensor, *args, **kwargs):
n = input.shape[self.dim]

if self.prob == 0 or (self.disable_on_eval and self.training is False):
self.inferred = 1.0
return function(input, *args, **kwargs)

skip = torch.bernoulli(torch.Tensor((n) * [self.prob]), generator=self.generator).to(input.device).to(input.dtype)
self.inferred = 1 - torch.mean(skip)
ind_selected = (skip == 0).nonzero().squeeze().to(input.device)

if ind_selected.numel() > 0:
x_selected = torch.index_select(input, self.dim, ind_selected)
out_selected = function(x_selected, *args, **kwargs)

out = input.clone()
assert self.dim == 0, "Currently only supporting dropping elements along the 0th dimension"
if ind_selected.numel() > 0:
out[ind_selected] = out_selected
return out
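
An illustrative call pattern (not part of the diff): the module wraps the call to a layer, skips a random subset of samples along dim, and passes the skipped samples through unchanged. Here torch.nn.Linear stands in for a TransformerDecoderLayer:

layer = torch.nn.Linear(16, 16)
x = torch.randn(64, 16)                        # 64 samples along dim 0

drop = LayerDropout(prob=0.0)
assert torch.equal(drop(layer, x), layer(x))   # prob=0 (or eval mode): layer always applied

drop = LayerDropout(prob=0.5, seed=0)          # in training, each sample skips the layer with prob 0.5
out = drop(layer, x)                           # skipped rows of out equal the matching rows of x
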

class ScaleType(str, Enum):
UNIFORM = "uniform"
EXP = "exp"
LINEAR = "linear"
LOG = "log"
SIN = "sin"
SIGMOID = "sigmoid"
STEP = "step"

def get_scale(scale_type: ScaleType, scale_period: int, val: int):
if scale_period == 0:
return 1

# all the equations below aim to make scale = 0 when val=0, and scale = 1 when val=scale_period
return {
ScaleType.UNIFORM: 1,
ScaleType.EXP: math.exp(val * math.log(2) / scale_period) - 1,
ScaleType.LINEAR: val / scale_period,
ScaleType.LOG: math.log(val + 1) / math.log(scale_period + 1),
ScaleType.SIN: math.sin(0.5 * math.pi * val / scale_period),
ScaleType.SIGMOID: 1 / (1 + math.exp(-10 * (val / scale_period - 0.5))),
}[scale_type]
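
Apart from UNIFORM, each entry above maps val=0 to 0 and val=scale_period to 1, except SIGMOID, which only reaches about 0.0067 and 0.9933 at the endpoints in this version; ScaleType.STEP is declared but has no entry here, so passing it raises a KeyError. A quick illustrative check (not part of the diff):

for st in (ScaleType.LINEAR, ScaleType.EXP, ScaleType.LOG, ScaleType.SIN, ScaleType.SIGMOID):
    print(st.value, get_scale(st, scale_period=8, val=0), get_scale(st, scale_period=8, val=8))
# linear/exp/log/sin go from 0.0 to 1.0; sigmoid goes from ~0.0067 to ~0.9933
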

def create_layer_dropout_modules(num_layers: int, prob_max: float= 0.0, prob_layer_scale: ScaleType = ScaleType.EXP, layers_str: Optional[str] = None, disable_on_eval: bool = True):
layer_dropouts = torch.nn.ModuleList()
has_dropout = slice_str_to_array(layers_str, num_layers) if layers_str else [True] * num_layers

for layer_id in range(num_layers):
prob = prob_max * get_scale(
scale_type = prob_layer_scale,
scale_period = num_layers - 1,
val = layer_id,
) if has_dropout[layer_id] else 0.0
assert prob >= 0.0 and prob <= prob_max, f"prob={prob} should be between 0 and {prob_max}"
# We would like each layer to have a different seed, so that we don't have the same samples skipped across layers. Hence, we use the layer_id as a seed for each layer's dropout.
layer_dropout = LayerDropout(prob, disable_on_eval=disable_on_eval, seed=layer_id)
layer_dropouts.append(layer_dropout)

return layer_dropouts
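
For instance (illustrative, not part of the diff), with the default exponential scale the per-layer probabilities ramp from 0 at the first layer up to prob_max at the last, while the per-layer seeds keep the skipped samples decorrelated across layers:

dropouts = create_layer_dropout_modules(num_layers=4, prob_max=0.2, layers_str=":")
print([round(d.prob, 3) for d in dropouts])    # [0.0, 0.052, 0.117, 0.2]
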
22 changes: 19 additions & 3 deletions torchtune/modules/transformer.py
@@ -4,12 +4,14 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import copy
from typing import Optional
from collections import OrderedDict
from typing import List, Optional, Union

import torch
from torch import nn, Tensor

from torchtune.modules import CausalSelfAttention, KVCache
from torchtune.modules import LayerDropout, create_layer_dropout_modules


class TransformerDecoderLayer(nn.Module):
@@ -121,6 +123,8 @@ class TransformerDecoder(nn.Module):
before final MLP.
output (nn.Linear): Callable that applies a linear transformation to the output of
the decoder.
layer_dropout_prob (float): Probability of skipping samples in the transformer
layer.

Note:
Arg values are checked for correctness (eg: ``attn_dropout`` belongs to [0,1])
@@ -138,6 +142,9 @@ def __init__(
head_dim: int,
norm: nn.Module,
output: nn.Linear,
layer_dropout_prob: float = 0.5,
layer_dropout_prob_layer_scale: str = "exp",
layer_dropout_str: str = ":",
) -> None:
super().__init__()

@@ -150,6 +157,9 @@
self.head_dim = head_dim
self.causal_mask = None

self.layer_dropouts = create_layer_dropout_modules(num_layers, layer_dropout_prob, layer_dropout_prob_layer_scale, layer_dropout_str)
self.output_hidden_states = OrderedDict() # TODO: use tensordict?

def setup_caches(self, batch_size: int, dtype: torch.dtype) -> None:
"""Setup key value caches for attention calculation.

@@ -188,6 +198,7 @@ def forward(
*,
mask: Optional[Tensor] = None,
input_pos: Optional[Tensor] = None,
output_hidden_states: Union[bool, List[bool]] = False,
) -> Tensor:
"""
Args:
@@ -227,6 +238,9 @@
# shape: [b, s, d]
h = self.tok_embeddings(tokens)

if isinstance(output_hidden_states, bool):
output_hidden_states = [output_hidden_states] * len(self.layers)

if self.causal_mask is not None:
if input_pos is None:
raise ValueError(
Expand All @@ -240,9 +254,11 @@ def forward(
# in most cases input_pos_len should be 1
mask = self.causal_mask[None, input_pos]

for layer in self.layers:
for i, layer in enumerate(self.layers):
# shape: [b, s, d]
h = layer(h, mask=mask, input_pos=input_pos)
h = self.layer_dropouts[i](layer, h, mask=mask, input_pos=input_pos)
if output_hidden_states[i]:
self.output_hidden_states[i] = h

# shape: [b, s, d]
h = self.norm(h)
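
Putting the pieces together, a rough sketch (not part of the diff) of how a caller uses the new forward flag and the output_hidden_states attribute, assuming model is a TransformerDecoder and tokens is a [b, s] batch as in the recipe above:

want = [i in (3, 7) for i in range(len(model.layers))]    # capture hidden states of layers 3 and 7
logits = model(tokens, output_hidden_states=want)
for layer_id, h in model.output_hidden_states.items():    # OrderedDict of {layer_idx: [b, s, d]}
    early_logits = model.output(model.norm(h))             # unembed an intermediate hidden state

Note that in this version the dict is never cleared between forward calls; a later commit ("clear hidden_states at ..." in the commit list above) appears to address that.
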