
Commit 321a888

model fragments for diloco
Summary:
- add a configuration option for users to provide how they want to partition the model
- if this is provided, the model needs to implement `FaultTolerantTrainSpec`, which defines the fragmentation function that splits the model based on the configuration
- determine the model fragments in the training script and pass them to the ft manager

Test Plan: Running llama3 8b with 2 fragments and 1 step delay; each fragment gets synced every 20 steps.

<img width="944" height="545" alt="image" src="https://github.com/user-attachments/assets/6d16f486-7260-49d6-8ba3-3e98cd331e58" />
1 parent f3e2a75 commit 321a888

6 files changed, +287 -6 lines changed

torchtitan/config/job_config.py

Lines changed: 12 additions & 0 deletions
```diff
@@ -661,6 +661,18 @@ class FaultTolerance:
     This is only used when "semi_sync_method" is set.
     """

+    module_names_per_model_chunk: list[list[str]] = field(default_factory=list)
+    """
+    Specify a list of lists containing the FQNs (Fully Qualified Names) of modules for each model chunk.
+    Each inner list represents one model chunk and contains the module names that belong to that chunk.
+    e.g. [['tok_embeddings', 'layers.0'], ['layers.1', 'layers.2'], ['layers.3', 'layers.4']]
+    will create 3 chunks: the first containing tok_embeddings and layers.0,
+    the second containing layers.1 and layers.2, and the third containing layers.3 and layers.4.
+    This provides more explicit control over which modules belong to each chunk compared to split points.
+    """
+
+    num_fragments: int = 1
+

 @dataclass
 class Experimental:
```
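As an illustration (not part of the commit), a minimal sketch of how these two fields could be populated; in practice they would come from the job config file, the import path follows the file above, and the remaining FaultTolerance fields are assumed to keep their defaults:

```python
from torchtitan.config.job_config import FaultTolerance

# Explicit chunking: three chunks covering a hypothetical 5-layer model,
# mirroring the docstring example above plus the final norm/output modules.
ft = FaultTolerance(
    num_fragments=3,
    module_names_per_model_chunk=[
        ["tok_embeddings", "layers.0"],
        ["layers.1", "layers.2"],
        ["layers.3", "layers.4", "norm", "output"],
    ],
)

# Automatic chunking: leave module_names_per_model_chunk empty and let
# num_fragments drive the split (see fragment_llama later in this commit).
ft_auto = FaultTolerance(num_fragments=2)
```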

torchtitan/distributed/pipeline.py

Lines changed: 200 additions & 2 deletions
```diff
@@ -6,6 +6,8 @@
 import os
 from typing import Callable

+from torch import nn
+
 from torch.distributed.pipelining.schedules import (
     _PipelineSchedule,
     _PipelineScheduleRuntime,
@@ -19,7 +21,13 @@
 from torchtitan.tools.logging import logger


-__all__ = ["build_pipeline_schedule", "generate_split_points", "stage_ids_this_rank"]
+__all__ = [
+    "build_pipeline_schedule",
+    "generate_split_points",
+    "stage_ids_this_rank",
+    "generate_module_names_per_stage",
+    "module_split",
+]


 # TODO: It's unclear if this API is general enough to be used by other models.
@@ -206,6 +214,196 @@ def stage_ids_this_rank(
         stages_per_rank == 2
     ), f"v schedules assume 2 stages per rank, got {stages_per_rank}"
     stage_v_pairs = list(
-        zip(range(pp_size), range(num_stages - 1, pp_size - 1, -1))
+        zip(range(pp_size), range(num_stages - 1, pp_size - 1, -1), strict=True)
     )
     return stage_v_pairs[pp_rank]
+
+
+def generate_module_names_per_stage(
+    num_stages: int,
+    num_layers: int,
+    input_weight: int = 1,
+    output_weight: int = 1,
+) -> list[list[str]]:
+    """
+    Programmatically generates module names per stage for pipeline parallelism with weighting.
+
+    Args:
+        num_stages: Number of pipeline stages
+        num_layers: Total number of transformer layers in the model
+        input_weight: Weight for input modules (tok_embeddings) in layer calculation
+        output_weight: Weight for output modules (norm + output) in layer calculation
+
+    Returns:
+        List of lists containing module names for each stage
+
+    Example:
+        generate_module_names_per_stage(2, 3, input_weight=2, output_weight=2)
+        treats embeddings as 2 layers and norm+output as 2 layers for distribution
+    """
+    if num_stages < 1:
+        raise ValueError("Number of stages must be at least 1")
+
+    if num_stages == 1:
+        # Single stage gets everything
+        layer_names = [f"layers.{i}" for i in range(num_layers)]
+        return [["tok_embeddings"] + layer_names + ["norm", "output"]]
+
+    # Calculate effective layers including weights
+    num_effective_layers = num_layers + input_weight + output_weight
+
+    if num_stages > num_effective_layers:
+        raise ValueError(
+            f"Number of stages ({num_stages}) cannot be greater than effective layers ({num_effective_layers})"
+        )
+
+    # Calculate layers per stage (distribute evenly)
+    layers_per_stage = num_effective_layers // num_stages
+    extra_layers = num_effective_layers % num_stages
+
+    # Ensure each stage gets at least the weight of input/output modules
+    if layers_per_stage < max(input_weight, output_weight):
+        raise ValueError(
+            f"Layers per stage ({layers_per_stage}) must be >= max(input_weight={input_weight}, output_weight={output_weight})"
+        )
+
+    module_names_per_stage = []
+    current_layer = 0
+
+    for stage_idx in range(num_stages):
+        stage_modules = []
+
+        # Calculate effective layers for this stage
+        effective_layers_for_stage = layers_per_stage
+        if stage_idx < extra_layers:
+            effective_layers_for_stage += 1
+
+        # First stage: handle input modules with weighting
+        if stage_idx == 0:
+            stage_modules.append("tok_embeddings")
+            # Account for input weight in layer distribution
+            remaining_layers_for_stage = effective_layers_for_stage - input_weight
+
+            # Add transformer layers
+            for _ in range(remaining_layers_for_stage):
+                if current_layer < num_layers:
+                    stage_modules.append(f"layers.{current_layer}")
+                    current_layer += 1
+
+        # Last stage: handle output modules with weighting
+        elif stage_idx == num_stages - 1:
+            # Account for output weight in layer distribution
+            remaining_layers_for_stage = effective_layers_for_stage - output_weight
+
+            # Add transformer layers
+            for _ in range(remaining_layers_for_stage):
+                if current_layer < num_layers:
+                    stage_modules.append(f"layers.{current_layer}")
+                    current_layer += 1
+
+            # Add output modules
+            stage_modules.extend(["norm", "output"])
+
+        # Middle stages: only transformer layers
+        else:
+            for _ in range(effective_layers_for_stage):
+                if current_layer < num_layers:
+                    stage_modules.append(f"layers.{current_layer}")
+                    current_layer += 1
+
+        module_names_per_stage.append(stage_modules)
+
+    return module_names_per_stage
+
+
+def module_split(
+    model: nn.Module,
+    module_names_per_stage: list[list[str]],
+) -> list[nn.Module]:
+    """
+    This API creates pipeline stages based on specified module names for each stage.
+    This method updates the model in place.
+
+    Args:
+        model: The complete model to be split
+        module_names_per_stage: List of lists, where each inner list contains the module names
+                                that should be included in that stage. Module names should be
+                                dot-separated paths. Examples:
+                                - "tok_embeddings" for token embeddings
+                                - "layers.0", "layers.1" for specific transformer layers
+                                - "norm" for the final normalization layer
+                                - "output" for the output projection layer
+
+    Returns:
+        List of model chunks
+
+    Example usage:
+        module_names_per_stage = [
+            ["tok_embeddings", "layers.0"],  # Stage 0: embeddings + first layer
+            ["layers.1", "layers.2"],        # Stage 1: middle layers
+            ["norm", "output"]               # Stage 2: final norm + output
+        ]
+    """

+    def _build_stage_from_modules(stage_idx: int, module_names: list[str]) -> nn.Module:
+        stage_model = nn.Module()
+        # Create a set of modules to keep for faster lookup
+        modules_to_keep = set(module_names)
+        print(f"Stage {stage_idx}: Modules to keep: {modules_to_keep}")
+        for module_name, module_value in model.named_children():
+            # Handle layer-like structures (e.g., "layers.0", "layers.1")
+            if isinstance(module_value, (nn.ModuleDict, nn.ModuleList)):
+                layers_to_keep = {
+                    name.split(".", 1)[1]
+                    for name in modules_to_keep
+                    if name.startswith(f"{module_name}.")
+                }
+
+                if not layers_to_keep:
+                    continue
+
+                # Keep only specified layers
+                if isinstance(module_value, nn.ModuleDict):
+                    for layer_name in list(module_value.keys()):
+                        if layer_name in layers_to_keep:
+                            setattr(
+                                stage_model,
+                                f"{module_name}.{layer_name}",
+                                module_value[layer_name],
+                            )
+                else:
+                    indices_to_keep = {
+                        int(idx) for idx in layers_to_keep if idx.isdigit()
+                    }
+                    new_layers = nn.ModuleList(
+                        [
+                            layer
+                            for i, layer in enumerate(module_value)
+                            if i in indices_to_keep
+                        ]
+                    )
+                    setattr(stage_model, module_name, new_layers)
+
+                continue
+
+            # Handle simple module attributes (e.g., "linear", "norm")
+            if module_name not in modules_to_keep:
+                continue
+
+            setattr(stage_model, module_name, module_value)
+
+        return stage_model
+
+    num_stages = len(module_names_per_stage)
+    models = []
+
+    for stage_idx in range(num_stages):
+        module_names = module_names_per_stage[stage_idx]
+        model_chunk = _build_stage_from_modules(
+            stage_idx,
+            module_names,
+        )
+        logger.info(f"building stage_idx {stage_idx} " f"with modules {module_names}")
+        models.append(model_chunk)
+
+    return models
```
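To see what the two new helpers produce, here is a quick sanity check on a toy module (not from the commit); `ToyTransformer` is a made-up stand-in that only mimics the attribute names the splitter expects (`tok_embeddings`, `layers`, `norm`, `output`):

```python
import torch.nn as nn

from torchtitan.distributed.pipeline import generate_module_names_per_stage, module_split


class ToyTransformer(nn.Module):
    def __init__(self, n_layers: int = 4, dim: int = 16):
        super().__init__()
        self.tok_embeddings = nn.Embedding(100, dim)
        self.layers = nn.ModuleList([nn.Linear(dim, dim) for _ in range(n_layers)])
        self.norm = nn.LayerNorm(dim)
        self.output = nn.Linear(dim, 100)


names = generate_module_names_per_stage(num_stages=2, num_layers=4)
# -> [["tok_embeddings", "layers.0", "layers.1"],
#     ["layers.2", "layers.3", "norm", "output"]]

fragments = module_split(ToyTransformer(), names)
# fragments[0] holds tok_embeddings plus a 2-element ModuleList (original layers 0-1).
# fragments[1] holds a 2-element ModuleList (original layers 2-3, re-indexed from 0),
# plus norm and output.
```

Note that kept ModuleList entries are re-indexed from zero inside each chunk, so fragment-local FQNs differ from those in the original model.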

torchtitan/models/llama3/__init__.py

Lines changed: 4 additions & 2 deletions
```diff
@@ -10,7 +10,8 @@
 from torchtitan.components.tokenizer import build_hf_tokenizer
 from torchtitan.components.validate import build_validator
 from torchtitan.datasets.hf_datasets import build_hf_dataloader
-from torchtitan.protocols.train_spec import register_train_spec, TrainSpec
+from torchtitan.protocols.train_spec import FaultTolerantTrainSpec, register_train_spec
+from .infra.fault_tolerance import fragment_llama

 from .infra.parallelize import parallelize_llama
 from .infra.pipeline import pipeline_llama
@@ -71,12 +72,13 @@


 register_train_spec(
-    TrainSpec(
+    FaultTolerantTrainSpec(
         name="llama3",
         model_cls=Transformer,
         model_args=llama3_configs,
         parallelize_fn=parallelize_llama,
         pipelining_fn=pipeline_llama,
+        fragment_fn=fragment_llama,
         build_optimizers_fn=build_optimizers,
         build_lr_schedulers_fn=build_lr_schedulers,
         build_dataloader_fn=build_hf_dataloader,
```

torchtitan/models/llama3/infra/fault_tolerance.py

Lines changed: 45 additions & 0 deletions
```diff
@@ -0,0 +1,45 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# This file is used to setup the model for fault tolerance
+
+import torch.nn as nn
+
+from torchtitan.config import JobConfig
+from torchtitan.distributed.pipeline import (
+    generate_module_names_per_stage,
+    module_split,
+)
+
+from ..model.args import TransformerModelArgs
+
+
+def fragment_llama(
+    model: nn.Module,
+    job_config: JobConfig,
+    model_config: TransformerModelArgs,
+) -> list[nn.Module]:
+    ft = job_config.fault_tolerance
+
+    assert ft.num_fragments > 0
+
+    module_names_per_stage = ft.module_names_per_model_chunk
+
+    input_weight = 1  # Weight for tok_embeddings
+    output_weight = 1  # Weight for norm + output layers
+
+    if module_names_per_stage == []:
+        if ft.num_fragments == 1:
+            return [model]
+
+        module_names_per_stage = generate_module_names_per_stage(
+            ft.num_fragments, model_config.n_layers, input_weight, output_weight
+        )
+
+    model_fragments = module_split(model, module_names_per_stage)
+    print(f"Created {len(model_fragments)} model fragments")
+
+    return model_fragments
```
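For reference, the Test Plan above (2 fragments on a Llama 3 8B model, which has 32 transformer layers) exercises the empty-config path of `fragment_llama`; a rough sketch of the split it would produce (the layer count is inferred, not stated in the commit):

```python
from torchtitan.distributed.pipeline import generate_module_names_per_stage

# Default weights (1 for tok_embeddings, 1 for norm + output), as hard-coded in fragment_llama.
names = generate_module_names_per_stage(num_stages=2, num_layers=32)
# names[0] == ["tok_embeddings", "layers.0", ..., "layers.15"]   (17 entries)
# names[1] == ["layers.16", ..., "layers.31", "norm", "output"]  (18 entries)
```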

torchtitan/protocols/train_spec.py

Lines changed: 8 additions & 0 deletions
```diff
@@ -56,6 +56,14 @@ class TrainSpec:
     state_dict_adapter: type[StateDictAdapter] | None = None


+FragmentFunction: TypeAlias = Callable[..., list[nn.Module]]
+
+
+@dataclass
+class FaultTolerantTrainSpec(TrainSpec):
+    fragment_fn: FragmentFunction | None = None
+
+
 _train_specs = {}

```
torchtitan/train.py

Lines changed: 18 additions & 2 deletions
```diff
@@ -8,7 +8,7 @@
 import os
 import time
 from datetime import timedelta
-from typing import Any, Generator, Iterable, Optional
+from typing import Any, cast, Generator, Iterable, Optional

 import torch
 from torch.distributed.elastic.multiprocessing.errors import record
@@ -43,6 +43,7 @@ class Trainer(torch.distributed.checkpoint.stateful.Stateful):
     tokenizer: train_spec_module.BaseTokenizer | None
     dataloader: train_spec_module.BaseDataLoader
     model_parts: list[torch.nn.Module]
+    ft_model_parts: list[torch.nn.Module]
     loss_fn: train_spec_module.LossFunction
     optimizers: train_spec_module.OptimizersContainer
     lr_schedulers: train_spec_module.LRSchedulersContainer
@@ -215,6 +216,8 @@ def __init__(self, job_config: JobConfig):
             self.loss_fn, self.gradient_accumulation_steps
         )

+        self.ft_model_parts = []
+
         # apply parallelisms and initialization
         if parallel_dims.pp_enabled:
             if not self.train_spec.pipelining_fn:
@@ -261,6 +264,19 @@ def __init__(self, job_config: JobConfig):

             self.model_parts = [model]

+            ft = job_config.fault_tolerance
+
+            if ft.enable:
+                train_spec = cast(
+                    train_spec_module.FaultTolerantTrainSpec, self.train_spec
+                )
+                if train_spec.fragment_fn:
+                    self.ft_model_parts = train_spec.fragment_fn(
+                        model, job_config, model_args
+                    )
+                else:
+                    self.ft_model_parts = [model]
+
         self.ft_manager.maybe_set_all_reduce_hook(self.model_parts)

         # initialize device memory monitor and get peak flops for MFU calculation
@@ -524,7 +540,7 @@ def train(self):
             maybe_semi_sync_training(
                 job_config.fault_tolerance,
                 ft_manager=self.ft_manager,
-                model_parts=self.model_parts,
+                model_parts=self.ft_model_parts,
                 optimizer=self.optimizers,
             ),
         ):
```
