Commit dc3946e

unify moe implementation for llama4 and deepseek_v3
1 parent a204e31 commit dc3946e

File tree

17 files changed: +246 -481 lines changed


torchtitan/components/quantization/float8.py

Lines changed: 1 addition & 3 deletions
@@ -10,9 +10,7 @@
 
 from torchtitan.config.job_config import Float8, JobConfig
 from torchtitan.distributed import ParallelDims
-from torchtitan.experiments.llama4.infra.expert_parallel import (
-    set_token_group_alignment_size_m,
-)
+from torchtitan.distributed.expert_parallel import set_token_group_alignment_size_m
 from torchtitan.protocols.model_converter import (
     ModelConverter,
     register_model_converter,

torchtitan/components/quantization/mx.py

Lines changed: 2 additions & 5 deletions
@@ -13,6 +13,7 @@
 
 from torchtitan.config.job_config import JobConfig, MX
 from torchtitan.distributed import ParallelDims
+from torchtitan.distributed.expert_parallel import set_token_group_alignment_size_m
 from torchtitan.protocols.model_converter import (
     ModelConverter,
     register_model_converter,

@@ -58,12 +59,8 @@ def __init__(self, job_config: JobConfig, parallel_dims: ParallelDims):
 
         # For MoE training with mxfp8, token group sizes must be multiples of 32
        if job_config.mx.moe_fqns_prototype:
-            from torchtitan.experiments.llama4.infra.expert_parallel import (
-                set_token_group_alignment_size,
-            )
-
             mxfp8_block_size = 32
-            set_token_group_alignment_size(mxfp8_block_size)
+            set_token_group_alignment_size_m(mxfp8_block_size)
             logger.info(f"Setting token group alignment size to {mxfp8_block_size}")
 
         # Configure MXFP8
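
Aside (not part of the commit): setting the alignment to the mxfp8 block size means each expert's token group gets padded up to a multiple of 32 before the grouped GEMM. A minimal sketch of that rounding, with a hypothetical helper name:

# Illustration only; `pad_group_size` is a made-up stand-in for whatever
# consumes TOKEN_GROUP_ALIGN_SIZE_M inside expert_parallel.py.
def pad_group_size(num_tokens: int, align_m: int = 32) -> int:
    """Round a token-group length up to the next multiple of align_m."""
    return num_tokens + (-num_tokens % align_m)

assert pad_group_size(70) == 96   # 70 tokens -> padded to 96
assert pad_group_size(64) == 64   # already aligned, no padding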

torchtitan/experiments/llama4/infra/expert_parallel.py renamed to torchtitan/distributed/expert_parallel.py

Lines changed: 36 additions & 1 deletion
@@ -11,7 +11,8 @@
 import torch
 import torch.distributed as dist
 import torch.nn as nn
-from torch.distributed._functional_collectives import all_to_all_single_autograd
+
+# from torch.distributed._functional_collectives import all_to_all_single_autograd
 from torch.distributed.tensor import (
     DeviceMesh,
     distribute_module,

@@ -24,6 +25,40 @@
 from torch.distributed.tensor.placement_types import Placement
 
 
+# TODO: there is memory leak issue with AC + PT-D all_to_all_single_autograd
+# This is a temporary fix by @rakkit https://github.com/pytorch/torchtitan/issues/1467
+class _A2A(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, x, out_splits, in_splits, group):
+        if isinstance(out_splits, torch.Tensor):
+            out_splits = out_splits.tolist()
+        if isinstance(in_splits, torch.Tensor):
+            in_splits = in_splits.tolist()
+        T_out = int(sum(out_splits))
+
+        y = x.new_empty((T_out,) + tuple(x.shape[1:]))  # allocate by output splits
+        dist.all_to_all_single(y, x.contiguous(), out_splits, in_splits, group=group)
+
+        ctx.in_splits = in_splits
+        ctx.out_splits = out_splits
+        ctx.group = group
+        return y
+
+    @staticmethod
+    def backward(ctx, grad_y):
+        # grad wrt input has length sum(in_splits)
+        T_in = int(sum(ctx.in_splits))
+        grad_x = grad_y.new_empty((T_in,) + tuple(grad_y.shape[1:]))
+        dist.all_to_all_single(
+            grad_x, grad_y.contiguous(), ctx.in_splits, ctx.out_splits, group=ctx.group
+        )
+        return grad_x, None, None, None
+
+
+def all_to_all_single_autograd(x, out_splits, in_splits, group):
+    return _A2A.apply(x, out_splits, in_splits, group)
+
+
 TOKEN_GROUP_ALIGN_SIZE_M = 8
 ValidTokenGroupAlignmentSize = Literal[8, 16, 32]
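
Aside (illustrative, not part of the commit): the drop-in all_to_all_single_autograd keeps the signature of the functional-collectives version it comments out, so expert-parallel callers are unchanged. A hedged usage sketch, assuming a backend that supports all_to_all_single (NCCL, or Gloo on recent PyTorch) and a hypothetical script launched with torchrun:

# demo_a2a.py (hypothetical file name); run: torchrun --nproc_per_node=2 demo_a2a.py
import torch
import torch.distributed as dist

from torchtitan.distributed.expert_parallel import all_to_all_single_autograd


def main() -> None:
    dist.init_process_group(backend="gloo")
    world_size = dist.get_world_size()

    # Uniform splits keep the demo simple: every rank sends 2 rows to every rank.
    in_splits = [2] * world_size
    out_splits = [2] * world_size
    x = torch.randn(sum(in_splits), 4, requires_grad=True)

    y = all_to_all_single_autograd(x, out_splits, in_splits, dist.group.WORLD)
    y.sum().backward()  # backward runs the reverse all-to-all inside _A2A
    assert x.grad is not None and x.grad.shape == x.shape

    dist.destroy_process_group()


if __name__ == "__main__":
    main()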

File renamed without changes.

torchtitan/experiments/llama4/__init__.py

Lines changed: 5 additions & 4 deletions
@@ -9,6 +9,7 @@
 from torchtitan.components.tokenizer import build_hf_tokenizer
 from torchtitan.datasets.hf_datasets import build_hf_dataloader
 from torchtitan.models.llama3 import pipeline_llama
+from torchtitan.models.moe import MoEArgs
 from torchtitan.protocols.train_spec import register_train_spec, TrainSpec
 
 from .infra.parallelize import parallelize_llama

@@ -40,7 +41,7 @@
         multiple_of=2048,
         rope_theta=500000,
         max_seq_len=10485760,
-        num_experts=16,
+        moe_args=MoEArgs(num_experts=16),
         interleave_moe_layer_step=1,
     ),
     "17bx128e": TransformerModelArgs(

@@ -51,7 +52,7 @@
         ffn_dim_multiplier=1.2,
         multiple_of=2048,
         rope_theta=500000,
-        num_experts=128,
+        moe_args=MoEArgs(num_experts=128),
     ),
     "debugmodel_irope": TransformerModelArgs(
         dim=256,

@@ -73,7 +74,7 @@
         multiple_of=2048,
         rope_theta=500000,
         max_seq_len=10485760,
-        num_experts=16,
+        moe_args=MoEArgs(num_experts=16),
         interleave_moe_layer_step=1,
         every_n_layers_nope=4,
         use_flex_attn=True,

@@ -87,7 +88,7 @@
         ffn_dim_multiplier=1.2,
         multiple_of=2048,
         rope_theta=500000,
-        num_experts=128,
+        moe_args=MoEArgs(num_experts=128),
         every_n_layers_nope=4,
         use_flex_attn=True,
         attn_mask_type="block_causal",

torchtitan/experiments/llama4/infra/parallelize.py

Lines changed: 4 additions & 4 deletions
@@ -21,16 +21,16 @@
 from torchtitan.config import JobConfig, TORCH_DTYPE_MAP
 from torchtitan.distributed import ParallelDims
 
-from torchtitan.models.llama3.infra.parallelize import apply_ac, apply_ddp
-from torchtitan.tools.logging import logger
-
-from .expert_parallel import (
+from torchtitan.distributed.expert_parallel import (
     ExpertParallel,
     ExpertTensorParallel,
     NoParallel,
     TensorParallel,
 )
 
+from torchtitan.models.llama3.infra.parallelize import apply_ac, apply_ddp
+from torchtitan.tools.logging import logger
+
 
 def parallelize_llama(
     model: nn.Module,

torchtitan/experiments/llama4/model/args.py

Lines changed: 8 additions & 13 deletions
@@ -5,11 +5,13 @@
 # LICENSE file in the root directory of this source tree.
 
 
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 
 from torch import nn
 
 from torchtitan.config import JobConfig
+
+from torchtitan.models.moe import MoEArgs
 from torchtitan.protocols import BaseModelArgs
 from torchtitan.tools.logging import logger
 from torchtitan.tools.utils import has_cuda_capability

@@ -34,7 +36,6 @@ class TransformerModelArgs(BaseModelArgs):
 
     use_flex_attn: bool = False
     attn_mask_type: str = "causal"
-    eos_id: int = 0
     # iRoPE settings
     # When ``every_n_layers_nope`` is specified, NoPE (no positional embedding) is
     # used every n layers. Other layers uses RoPE (rotary positional embedding) and

@@ -45,17 +46,11 @@
     every_n_layers_nope: int | None = None
     fixed_attn_block_size: int = 8192
 
-    # MoE args
-    moe_enabled: bool = True
-    num_experts: int = 8
-    use_shared_expert: bool = True
+    # MoE
+    moe_args: MoEArgs = field(default_factory=MoEArgs)
     auto_scale_hidden_dim: bool = True
     # frequency of using MoE layer instead of feedforward layer in a transformer block
     interleave_moe_layer_step: int = 2
-    # token-choice
-    top_k: int = 1
-    use_grouped_mm: bool = True  # grouped mm or for-loop for the experts computation
-    load_balance_coeff: float | None = 1e-3
 
     def update_from_config(self, job_config: JobConfig, **kwargs) -> None:
         seq_len = job_config.training.seq_len

@@ -65,11 +60,11 @@ def update_from_config(self, job_config: JobConfig, **kwargs) -> None:
             )
         self.max_seq_len = seq_len
 
-        if self.use_grouped_mm and not has_cuda_capability(9, 0):
+        if self.moe_args.use_grouped_mm and not has_cuda_capability(9, 0):
             logger.warning(
                 "Failed to use grouped mm, which is only supported on SM90 or later",
             )
-            self.use_grouped_mm = False
+            self.moe_args.use_grouped_mm = False
 
         if job_config.parallelism.context_parallel_degree > 1 and self.use_flex_attn:
             raise NotImplementedError(

@@ -112,7 +107,7 @@ def get_nparams_and_flops(
         nparams_sparse_active = (
             nparams_moe_router
             + nparams_shared_expert
-            + nparams_experts * self.top_k // self.num_experts
+            + nparams_experts * self.moe_args.top_k // self.moe_args.num_experts
         )
 
         logger.info(
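
Aside (not part of the commit): the last hunk only repoints the active-parameter estimate at the nested moe_args; the arithmetic is unchanged. With token-choice routing, roughly top_k / num_experts of the routed-expert weights are active per token. A quick trace of just that term with made-up counts:

# Hypothetical numbers, only to illustrate the routed-experts term of the sum.
nparams_experts = 16_000_000          # all routed experts combined (made up)
top_k, num_experts = 1, 16            # 17bx16e-style routing
active = nparams_experts * top_k // num_experts
assert active == 1_000_000            # 1/16 of the routed-expert parameters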

torchtitan/experiments/llama4/model/model.py

Lines changed: 19 additions & 3 deletions
@@ -10,10 +10,10 @@
 from torch import nn
 
 from torchtitan.models.attention import build_attention, init_attention_mask
+from torchtitan.models.moe import MoE
 from torchtitan.protocols import ModelProtocol
 
 from .args import TransformerModelArgs
-from .moe import MoE
 
 
 def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0) -> torch.Tensor:

@@ -296,12 +296,28 @@ def __init__(
         self.attention = Attention(model_args, attn_use_rope, fixed_attn_block_size)
 
         # use MoE layer for every interleave_moe_layer_step FFN layers
+        moe_args = model_args.moe_args
         self.moe_enabled = (
-            model_args.moe_enabled
+            moe_args.moe_enabled
             and (layer_id + 1) % model_args.interleave_moe_layer_step == 0
         )
         if self.moe_enabled:
-            self.moe = MoE(model_args)
+            dim = model_args.dim
+            hidden_dim = 4 * model_args.dim
+            ffn_dim_multiplier = model_args.ffn_dim_multiplier
+            hidden_dim = int(2 * hidden_dim / 3)
+            if ffn_dim_multiplier is not None:
+                hidden_dim = int(ffn_dim_multiplier * hidden_dim)
+
+            hidden_dim_denom = 1
+            if model_args.auto_scale_hidden_dim:
+                hidden_dim_denom = moe_args.top_k + moe_args.num_shared_experts
+
+            if model_args.auto_scale_hidden_dim:
+                hidden_dim = int(hidden_dim / hidden_dim_denom)
+            hidden_dim += -hidden_dim % model_args.multiple_of
+
+            self.moe = MoE(moe_args, dim=dim, hidden_dim=hidden_dim)
         else:
             self.feed_forward = FeedForward(
                 dim=model_args.dim,
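
Aside (not part of the commit): the inlined sizing presumably reproduces what the removed llama4 MoE module computed internally. It starts from the dense FFN width (4 * dim, cut to 2/3, scaled by ffn_dim_multiplier), optionally divides by top_k + num_shared_experts so the total activated expert width roughly matches a dense FFN, then rounds up to multiple_of. A worked trace with hypothetical sizes:

# Hypothetical sizes, just to trace the arithmetic in the hunk above.
dim, ffn_dim_multiplier, multiple_of = 1024, 1.2, 2048
top_k, num_shared_experts, auto_scale = 1, 1, True

hidden_dim = int(2 * (4 * dim) / 3)                       # 2730
if ffn_dim_multiplier is not None:
    hidden_dim = int(ffn_dim_multiplier * hidden_dim)     # 3276
if auto_scale:
    hidden_dim = int(hidden_dim / (top_k + num_shared_experts))  # 1638
hidden_dim += -hidden_dim % multiple_of                   # rounded up to 2048
assert hidden_dim == 2048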

torchtitan/experiments/llama4/train_configs/debug_model.toml

Lines changed: 1 addition & 1 deletion
@@ -63,7 +63,7 @@ export_dtype = "float32"
 async_mode = "disabled"  # ["disabled", "async", "async_with_pinned_mem"]
 
 [activation_checkpoint]
-mode = "none"  # ["none", "selective", "full"]
+mode = "selective"  # ["none", "selective", "full"]
 selective_ac_option = '2'  # 'int' = ac every positive int layer or 'op', ac based on ops policy
 
 [float8]

torchtitan/models/deepseek_v3/__init__.py

Lines changed: 43 additions & 21 deletions
@@ -12,6 +12,7 @@
 from torchtitan.datasets.hf_datasets import build_hf_dataloader
 from torchtitan.experiments.llama4.optimizer import build_llama4_optimizers
 from torchtitan.models.llama3.infra.pipeline import pipeline_llama
+from torchtitan.models.moe import MoEArgs
 
 from torchtitan.protocols.train_spec import register_train_spec, TrainSpec
 

@@ -36,10 +37,14 @@
         n_layers=3,
         n_dense_layers=1,
         n_heads=16,
-        n_routed_experts=8,
-        n_shared_experts=2,
-        n_activated_experts=3,
-        route_scale=1.0,
+        moe_args=MoEArgs(
+            num_experts=8,
+            num_shared_experts=2,
+            top_k=3,
+            score_func="softmax",
+            route_norm=True,
+            score_before_experts=False,
+        ),
         q_lora_rank=0,
         kv_lora_rank=512,
         qk_nope_head_dim=128,

@@ -55,10 +60,14 @@
         n_layers=3,
         n_dense_layers=1,
         n_heads=16,
-        n_routed_experts=8,
-        n_shared_experts=2,
-        n_activated_experts=3,
-        route_scale=1.0,
+        moe_args=MoEArgs(
+            num_experts=8,
+            num_shared_experts=2,
+            top_k=3,
+            score_func="softmax",
+            route_norm=True,
+            score_before_experts=False,
+        ),
         q_lora_rank=0,
         kv_lora_rank=512,
         qk_nope_head_dim=128,

@@ -76,10 +85,14 @@
         n_layers=27,
         n_dense_layers=1,
         n_heads=16,
-        n_routed_experts=64,
-        n_shared_experts=2,
-        n_activated_experts=6,
-        route_scale=1.0,
+        moe_args=MoEArgs(
+            num_experts=64,
+            num_shared_experts=2,
+            top_k=6,
+            score_func="softmax",
+            route_norm=True,
+            score_before_experts=False,
+        ),
         q_lora_rank=0,
         kv_lora_rank=512,
         qk_nope_head_dim=128,

@@ -95,12 +108,17 @@
         n_layers=60,
         n_dense_layers=1,
         n_heads=128,
-        n_routed_experts=160,
-        n_shared_experts=2,
-        n_activated_experts=6,
+        moe_args=MoEArgs(
+            num_experts=160,
+            num_shared_experts=2,
+            top_k=6,
+            score_func="softmax",
+            route_norm=True,
+            route_scale=16.0,
+            score_before_experts=False,
+        ),
         n_expert_groups=8,
         n_limited_groups=3,
-        route_scale=16.0,
         q_lora_rank=1536,
         kv_lora_rank=512,
         qk_nope_head_dim=128,

@@ -115,13 +133,17 @@
         n_layers=61,
         n_dense_layers=3,
         n_heads=128,
-        n_routed_experts=256,
-        n_shared_experts=1,
-        n_activated_experts=8,
+        moe_args=MoEArgs(
+            num_experts=256,
+            num_shared_experts=1,
+            top_k=8,
+            score_func="sigmoid",
+            route_norm=True,
+            route_scale=2.5,
+            score_before_experts=False,
+        ),
         n_expert_groups=8,
         n_limited_groups=4,
-        route_scale=2.5,
-        score_func="sigmoid",
         q_lora_rank=1536,
         kv_lora_rank=512,
         qk_nope_head_dim=128,
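
Aside (illustrative, not part of the commit): taken together, these configs exercise most of the unified MoEArgs surface, but the dataclass itself is not shown in this diff. A rough sketch inferred only from the fields used in this commit; the name, field defaults, and the placement of load_balance_coeff are assumptions, not torchtitan's actual definition:

# Sketch inferred from the MoEArgs(...) constructor calls in this commit.
from dataclasses import dataclass
from typing import Literal


@dataclass
class MoEArgsSketch:
    moe_enabled: bool = True
    num_experts: int = 8
    num_shared_experts: int = 1
    top_k: int = 1
    score_func: Literal["softmax", "sigmoid"] = "softmax"
    route_norm: bool = False
    route_scale: float = 1.0
    score_before_experts: bool = False
    use_grouped_mm: bool = True               # grouped mm vs. for-loop experts
    load_balance_coeff: float | None = 1e-3   # from the removed llama4 args; placement assumed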
