
Commit 5fa7db2

test updates
1 parent e47baf7 commit 5fa7db2

4 files changed: 118 additions & 59 deletions


test/prototype/moe_training/test_fsdp.py

Lines changed: 31 additions & 13 deletions
@@ -26,6 +26,7 @@
 from torch import distributed as dist
 from torch import nn
 from torch.distributed._composable.fsdp import fully_shard
+from torch.distributed.device_mesh import DeviceMesh, init_device_mesh
 from torch.nn import functional as F
 
 # this feature requires CUDA and SM89+
@@ -53,6 +54,26 @@
 )
 
 
+@pytest.fixture(scope="module")
+def device_mesh_1d() -> DeviceMesh:
+    """
+    Fixture for setting up and tearing down the distributed environment
+    for the entire test module.
+    """
+    rank = int(os.environ["RANK"])
+    world_size = int(os.environ["WORLD_SIZE"])
+    if not dist.is_initialized():
+        dist.init_process_group("nccl", rank=rank, world_size=world_size)
+
+    device_mesh = init_device_mesh("cuda", (world_size,))
+    torch.manual_seed(1)
+    torch.cuda.set_device(rank)
+
+    yield device_mesh
+
+    dist.destroy_process_group()
+
+
 @pytest.mark.parametrize(
     "target_fqns",
     [
@@ -80,7 +101,12 @@
         },
     ],
 )
-def test_moe_training_fsdp(target_fqns: list[str], compile: bool, recipe_config: dict):
+def test_moe_training_fsdp(
+    target_fqns: list[str],
+    compile: bool,
+    recipe_config: dict,
+    device_mesh_1d: DeviceMesh,
+):
     (
         recipe,
         group_alignment_size,
@@ -111,9 +137,6 @@ def test_moe_training_fsdp(target_fqns: list[str], compile: bool, recipe_config:
             f"Skipping MXFP8 benchmarks, only supported on compute capability 10.0 and found {torch.cuda.get_device_capability()}"
         )
 
-    # setup distributed for fsdp
-    setup_distributed()
-
     # set token group alignment size needed for GEMM (contraction dim stride must be 16 byte aligned)
     # or quantization ops (mxfp8 scaling groups are size 1x32)
     set_token_group_alignment_size_m(group_alignment_size)
@@ -154,6 +177,10 @@ def moe_module_filter_fn(mod: nn.Module, cur_fqn: str) -> bool:
         model,
         target_fqns=target_fqns,
     )
+    if compile:
+        # TODO: compile with fullgraph=True when torchtitan llama4 moe supports it
+        model = torch.compile(model, fullgraph=False)
+        ref_model = torch.compile(ref_model, fullgraph=False)
 
     # FSDP2
     fully_shard(model)
@@ -197,12 +224,3 @@ def moe_module_filter_fn(mod: nn.Module, cur_fqn: str) -> bool:
     assert param_grad_sqnr.item() >= min_param_grad_sqnr, (
         f"SQNR must be >= {min_param_grad_sqnr}, got {param_grad_sqnr.item()}."
    )
-
-    dist.destroy_process_group()
-
-
-def setup_distributed():
-    rank = int(os.environ["RANK"])
-    world_size = int(os.environ["WORLD_SIZE"])
-    dist.init_process_group("nccl", rank=rank, world_size=world_size)
-    torch.cuda.set_device(rank)
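
The key change in this file replaces the per-test setup_distributed()/destroy_process_group() pair with a module-scoped pytest fixture, so the NCCL process group and device mesh are created once and reused across every parametrized case in the module. A minimal sketch of that pattern, assuming the tests are launched with torchrun (which sets the RANK and WORLD_SIZE environment variables the fixture reads); the fixture and test names below are illustrative, not from the diff:

    # Sketch only: a module-scoped yield fixture owning process-group lifecycle.
    import os

    import pytest
    import torch
    import torch.distributed as dist
    from torch.distributed.device_mesh import DeviceMesh, init_device_mesh


    @pytest.fixture(scope="module")
    def mesh_1d() -> DeviceMesh:
        rank = int(os.environ["RANK"])              # set by torchrun
        world_size = int(os.environ["WORLD_SIZE"])  # set by torchrun
        if not dist.is_initialized():
            dist.init_process_group("nccl", rank=rank, world_size=world_size)
        device_mesh = init_device_mesh("cuda", (world_size,))
        torch.cuda.set_device(rank)
        yield device_mesh                # created once, shared by the module
        dist.destroy_process_group()     # torn down after the last test


    def test_mesh_size(mesh_1d: DeviceMesh):  # injected by parameter name
        assert mesh_1d.size() == int(os.environ["WORLD_SIZE"])

With scope="module", pytest caches the yielded mesh, so the process group is no longer destroyed and recreated between parametrized cases, which is what the removed per-test teardown used to do.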

test/prototype/moe_training/test_fsdp_tp.py

Lines changed: 38 additions & 27 deletions
@@ -74,6 +74,31 @@
 )
 
 
+@pytest.fixture(scope="module")
+def device_mesh_2d() -> DeviceMesh:
+    """
+    Fixture for setting up and tearing down the distributed environment
+    for the entire test module.
+    """
+    rank = int(os.environ["RANK"])
+    world_size = int(os.environ["WORLD_SIZE"])
+    if not dist.is_initialized():
+        dist.init_process_group("nccl", rank=rank, world_size=world_size)
+
+    device_mesh = init_device_mesh(
+        "cuda",
+        (world_size // 2, 2),
+        mesh_dim_names=("dp", "tp"),
+    )
+
+    torch.manual_seed(1)
+    torch.cuda.set_device(rank)
+
+    yield device_mesh
+
+    dist.destroy_process_group()
+
+
 @pytest.mark.parametrize(
     "target_fqns",
     [
@@ -102,7 +127,10 @@
     ],
 )
 def test_moe_training_fsdp_tp(
-    target_fqns: list[str], compile: bool, recipe_config: dict
+    target_fqns: list[str],
+    compile: bool,
+    recipe_config: dict,
+    device_mesh_2d: DeviceMesh,
 ):
     (
         recipe,
@@ -138,9 +166,6 @@ def test_moe_training_fsdp_tp(
     # or quantization ops (mxfp8 scaling groups are size 1x32)
     set_token_group_alignment_size_m(group_alignment_size)
 
-    # setup device mesh for fsdp + tp
-    mesh = setup_distributed()
-
     # define model args
     model_args = MoEArgs(
         num_experts=8,
@@ -177,13 +202,19 @@ def moe_module_filter_fn(mod: nn.Module, cur_fqn: str) -> bool:
         model,
         target_fqns=target_fqns,
     )
+    if compile:
+        # TODO: compile with fullgraph=True when torchtitan llama4 moe supports it
+        model = torch.compile(model, fullgraph=False)
+        ref_model = torch.compile(ref_model, fullgraph=False)
 
     # apply TP
-    apply_moe_ep_tp(model, tp_mesh=mesh["tp"], ep_mesh=None, ep_tp_mesh=None)
-    apply_moe_ep_tp(ref_model, tp_mesh=mesh["tp"], ep_mesh=None, ep_tp_mesh=None)
+    apply_moe_ep_tp(model, tp_mesh=device_mesh_2d["tp"], ep_mesh=None, ep_tp_mesh=None)
+    apply_moe_ep_tp(
+        ref_model, tp_mesh=device_mesh_2d["tp"], ep_mesh=None, ep_tp_mesh=None
+    )
 
     # apply FSDP2
-    fsdp_config = {"mesh": mesh["dp"]}
+    fsdp_config = {"mesh": device_mesh_2d["dp"]}
     fully_shard(model, **fsdp_config)
     fully_shard(ref_model, **fsdp_config)
 
@@ -246,26 +277,6 @@ def moe_module_filter_fn(mod: nn.Module, cur_fqn: str) -> bool:
         f"SQNR must be >= {min_param_grad_sqnr}, got {param_grad_sqnr.item()}."
     )
 
-    dist.destroy_process_group()
-
-
-def setup_distributed():
-    rank = int(os.environ["RANK"])
-    world_size = int(os.environ["WORLD_SIZE"])
-    dist.init_process_group("nccl", rank=rank, world_size=world_size)
-
-    # https://pytorch.org/tutorials/recipes/distributed_device_mesh.html
-    device_mesh = init_device_mesh(
-        "cuda",
-        (world_size // 2, 2),
-        mesh_dim_names=("dp", "tp"),
-    )
-
-    # seed must be the same in all processes
-    torch.manual_seed(1)
-    torch.cuda.set_device(rank)
-    return device_mesh
-
 
 def apply_moe_ep_tp(
     model: nn.Module,
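
In this file the fixture builds a named 2D mesh, and the test slices it by dimension name: indexing a DeviceMesh with a mesh_dim_name returns a 1D submesh, which is what gets handed to FSDP ("dp") and tensor parallelism ("tp"). A short sketch of the slicing, assuming the default process group is already initialized (as in the fixture) and the job was launched with an even number of ranks:

    # Sketch only: slicing a named 2D DeviceMesh into 1D submeshes,
    # mirroring device_mesh_2d["dp"] / device_mesh_2d["tp"] in the test.
    import os

    from torch.distributed.device_mesh import init_device_mesh

    world_size = int(os.environ["WORLD_SIZE"])  # e.g. 4 ranks via torchrun
    mesh_2d = init_device_mesh(
        "cuda",
        (world_size // 2, 2),                   # (dp degree, tp degree)
        mesh_dim_names=("dp", "tp"),
    )
    dp_mesh = mesh_2d["dp"]                     # 1D submesh for fully_shard
    tp_mesh = mesh_2d["tp"]                     # 1D submesh for apply_moe_ep_tp
    assert dp_mesh.ndim == 1 and tp_mesh.ndim == 1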

test/prototype/moe_training/test_tp.py

Lines changed: 32 additions & 19 deletions
@@ -72,6 +72,26 @@
 )
 
 
+@pytest.fixture(scope="module")
+def device_mesh_1d() -> DeviceMesh:
+    """
+    Fixture for setting up and tearing down the distributed environment
+    for the entire test module.
+    """
+    rank = int(os.environ["RANK"])
+    world_size = int(os.environ["WORLD_SIZE"])
+    if not dist.is_initialized():
+        dist.init_process_group("nccl", rank=rank, world_size=world_size)
+
+    device_mesh = init_device_mesh("cuda", (world_size,))
+    torch.manual_seed(1)
+    torch.cuda.set_device(rank)
+
+    yield device_mesh
+
+    dist.destroy_process_group()
+
+
 @pytest.mark.parametrize(
     "target_fqns",
     [
@@ -99,7 +119,12 @@
         },
     ],
 )
-def test_moe_training_tp(target_fqns: list[str], compile: bool, recipe_config: dict):
+def test_moe_training_tp(
+    target_fqns: list[str],
+    compile: bool,
+    recipe_config: dict,
+    device_mesh_1d: DeviceMesh,
+):
     (
         recipe,
         group_alignment_size,
@@ -134,9 +159,6 @@ def test_moe_training_tp(target_fqns: list[str], compile: bool, recipe_config: d
     # or quantization ops (mxfp8 scaling groups are size 1x32)
     set_token_group_alignment_size_m(group_alignment_size)
 
-    # setup device mesh for fsdp + tp
-    mesh = setup_distributed()
-
     # define model args
     model_args = MoEArgs(
         num_experts=8,
@@ -178,10 +200,14 @@ def moe_module_filter_fn(mod: nn.Module, cur_fqn: str) -> bool:
         model,
         target_fqns=target_fqns,
     )
+    if compile:
+        # TODO: compile with fullgraph=True when torchtitan llama4 moe supports it
+        model = torch.compile(model, fullgraph=False)
+        ref_model = torch.compile(ref_model, fullgraph=False)
 
     # apply TP
-    apply_moe_ep_tp(model, tp_mesh=mesh, ep_mesh=None, ep_tp_mesh=None)
-    apply_moe_ep_tp(ref_model, tp_mesh=mesh, ep_mesh=None, ep_tp_mesh=None)
+    apply_moe_ep_tp(model, tp_mesh=device_mesh_1d, ep_mesh=None, ep_tp_mesh=None)
+    apply_moe_ep_tp(ref_model, tp_mesh=device_mesh_1d, ep_mesh=None, ep_tp_mesh=None)
 
     # Rough validation that parallelization was applied properly.
     assert isinstance(model.experts.w1.data, DTensor), (
@@ -242,19 +268,6 @@ def moe_module_filter_fn(mod: nn.Module, cur_fqn: str) -> bool:
         f"SQNR must be >= {min_param_grad_sqnr}, got {param_grad_sqnr.item()}."
     )
 
-    dist.destroy_process_group()
-
-
-def setup_distributed():
-    rank = int(os.environ["RANK"])
-    world_size = int(os.environ["WORLD_SIZE"])
-    dist.init_process_group("nccl", rank=rank, world_size=world_size)
-    device_mesh = init_device_mesh("cuda", (world_size,))
-    # seed must be the same in all processes
-    torch.manual_seed(1)
-    torch.cuda.set_device(rank)
-    return device_mesh
-
 
 def apply_moe_ep_tp(
     model: nn.Module,
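
All three distributed tests also gain the same compile branch: both the quantized model and the bf16 reference are wrapped in torch.compile with fullgraph=False, so Dynamo can fall back to eager execution at graph breaks instead of raising; the in-diff TODO notes this is a stopgap until the torchtitan llama4 MoE supports full-graph capture. A hedged sketch of the toggle, using stand-in modules rather than the MoE under test:

    # Sketch only: model/ref_model are stand-ins for the converted MoE and
    # its bf16 reference from the test.
    import torch
    from torch import nn

    model = nn.Linear(16, 16)
    ref_model = nn.Linear(16, 16)

    compile = True
    if compile:
        # fullgraph=False tolerates graph breaks; fullgraph=True would error on them
        model = torch.compile(model, fullgraph=False)
        ref_model = torch.compile(ref_model, fullgraph=False)

    out = model(torch.randn(2, 16))  # first call triggers compilation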

test/prototype/moe_training/test_training.py

Lines changed: 17 additions & 0 deletions
@@ -73,6 +73,23 @@ def test_moe_training(target_fqns: list[str], compile: bool, recipe_config: dict
         recipe_config["min_input_grad_sqnr"],
         recipe_config["min_param_grad_sqnr"],
     )
+    assert torch.cuda.is_available()
+    if recipe == MoEScalingType.FP8_ROWWISE and torch.cuda.get_device_capability() != (
+        9,
+        0,
+    ):
+        pytest.skip(
+            f"Skipping FP8 rowwise tests, only supported on compute capability 9.0 and found {torch.cuda.get_device_capability()}"
+        )
+
+    elif recipe == MoEScalingType.MXFP8 and torch.cuda.get_device_capability() != (
+        10,
+        0,
+    ):
+        pytest.skip(
+            f"Skipping MXFP8 benchmarks, only supported on compute capability 10.0 and found {torch.cuda.get_device_capability()}"
+        )
+
     # Set token group alignment size. This is required so that
     # each logically distinct gemm in the grouped gemm `grad_weight = grad_output_t @ input`
     # has the contraction dim be divisible by 16. 16 byte alignment is required
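
This file gains the runtime hardware gate the other tests already had: FP8 rowwise recipes are skipped unless the GPU reports compute capability 9.0, and MXFP8 unless it reports 10.0. The two branches could be folded into a single helper; a sketch of that refactor follows (require_capability is hypothetical, not part of this commit):

    # Sketch only: require_capability is a hypothetical helper, not in the diff.
    import pytest
    import torch


    def require_capability(required: tuple[int, int], what: str) -> None:
        assert torch.cuda.is_available()
        cap = torch.cuda.get_device_capability()  # (major, minor)
        if cap != required:
            pytest.skip(
                f"Skipping {what}, only supported on compute capability "
                f"{required[0]}.{required[1]} and found {cap}"
            )

    # usage inside the test body:
    # if recipe == MoEScalingType.FP8_ROWWISE:
    #     require_capability((9, 0), "FP8 rowwise tests")
    # elif recipe == MoEScalingType.MXFP8:
    #     require_capability((10, 0), "MXFP8 tests")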
