
Commit 96149f6

Remove create_cp_block_mask (#1798)
This logic was committed prematurely, since the final CP (context parallelism) UX may end up looking different. Remove it for now to avoid confusion and future backward-compatibility (BC) issues.
1 parent 4409c13 commit 96149f6

File tree

2 files changed: 2 additions & 20 deletions


torchtitan/models/attention.py

Lines changed: 1 addition & 16 deletions
@@ -6,12 +6,10 @@
 #
 # Copyright (c) Meta Platforms, Inc. All Rights Reserved.
 
-import functools
 from typing import Callable, ClassVar
 
 import torch
 import torch.nn.functional as F
-from torch.distributed.tensor.experimental._attention import create_cp_block_mask
 from torch.nn.attention import sdpa_kernel, SDPBackend
 from torch.nn.attention.flex_attention import (
     _mask_mod_signature,
@@ -241,18 +239,5 @@ def build_attention(
     return ScaledDotProductAttention(attn_mask_type)
 
 
-def init_attention_mask(
-    batch: torch.Tensor,
-    eos_id: int | None,
-    cp_mesh: torch.distributed.device_mesh.DeviceMesh | None = None,
-) -> None:
-
-    # This is not functional yet because we currently gate the use of Flex + CP
-    # while we continue debugging accuracy issues. However, we want to evaluate
-    # the user experience with CP enabled.
-    if cp_mesh is not None:
-        FlexAttention.compiled_create_block_mask = functools.partial(
-            create_cp_block_mask, device_mesh=cp_mesh
-        )
-
+def init_attention_mask(batch: torch.Tensor, eos_id: int | None) -> None:
     FlexAttention.init_attention_mask(batch, eos_id)
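
For context, the deleted branch relied on rebinding a class-level hook (FlexAttention.compiled_create_block_mask) to a functools.partial that injected the CP device mesh. The standard-library sketch below shows that mechanism only; MaskBuilder, default_builder, cp_aware_builder, and the placeholder mesh string are illustrative stand-ins, not torchtitan APIs.

import functools

def default_builder(seq_len: int) -> str:
    # Single-rank path: build a mask over the full sequence.
    return f"block mask over {seq_len} tokens"

def cp_aware_builder(seq_len: int, *, device_mesh) -> str:
    # CP path: the extra device_mesh argument is bound in advance via partial.
    return f"block mask over {seq_len} tokens, sharded on {device_mesh}"

class MaskBuilder:
    # Analogous to the FlexAttention.compiled_create_block_mask ClassVar hook.
    compiled_create_block_mask = staticmethod(default_builder)

# The removed init_attention_mask swapped the hook whenever a CP mesh existed,
# so every later call transparently produced a CP-aware mask:
cp_mesh = "2-rank cp mesh"  # placeholder for a torch.distributed DeviceMesh
MaskBuilder.compiled_create_block_mask = staticmethod(
    functools.partial(cp_aware_builder, device_mesh=cp_mesh)
)
print(MaskBuilder.compiled_create_block_mask(8192))

This implicit, class-wide rebinding is exactly the kind of surface the commit message flags as risky for future BC, which is why it is being pulled out until the CP UX settles.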

torchtitan/train.py

Lines changed: 1 addition & 4 deletions
@@ -416,10 +416,7 @@ def forward_backward_step(
         extra_inputs = {k: v for k, v in input_dict.items() if k != "input"}
         # Create the FlexAttention mask according to the input
         if getattr(self.model_args, "use_flex_attn", False):
-            cp_mesh = (
-                parallel_dims.world_mesh["cp"] if parallel_dims.cp_enabled else None
-            )
-            init_attention_mask(inputs, self.tokenizer.eos_id, cp_mesh)
+            init_attention_mask(inputs, self.tokenizer.eos_id)
 
         # apply context parallelism if cp is enabled
         # ensure CP handles the separate freqs_cis buffer for each pp stage
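
After this change the call site only builds the plain (non-CP) FlexAttention mask from the input batch. The sketch below is my assumption of roughly what such a mask looks like using the public torch.nn.attention.flex_attention API (PyTorch >= 2.5); it is not torchtitan's actual FlexAttention.init_attention_mask, and the names tokens, doc_ids, and document_causal are illustrative.

import torch
from torch.nn.attention.flex_attention import create_block_mask

eos_id = 0
seq_len = 128
torch.manual_seed(0)
tokens = torch.randint(1, 100, (seq_len,))
tokens[63] = eos_id  # one document boundary inside the packed sequence

# Document index per position; the EOS token stays with the document it closes.
eos_flags = (tokens == eos_id).to(torch.int32)
doc_ids = torch.cumsum(eos_flags, dim=0) - eos_flags

def document_causal(b, h, q_idx, kv_idx):
    # Causal attention restricted to tokens within the same document.
    return (q_idx >= kv_idx) & (doc_ids[q_idx] == doc_ids[kv_idx])

block_mask = create_block_mask(
    document_causal, B=None, H=None, Q_LEN=seq_len, KV_LEN=seq_len, device="cpu"
)
print(block_mask)

The removed code path additionally sharded this mask construction across the CP mesh; with it gone, CP setup happens only in the context-parallelism block that follows.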

0 commit comments