From 45fb201a634b1ea06d825655f09483c8a571a488 Mon Sep 17 00:00:00 2001
From: Daniel Vega-Myhre
Date: Fri, 22 Aug 2025 15:20:48 -0700
Subject: [PATCH] [moe training] add test case for shared expert in distributed tests

stack-info: PR: https://github.com/pytorch/ao/pull/2856, branch: danielvegamyhre/stack/56
---
 test/prototype/moe_training/test_everything.sh |  6 +++---
 test/prototype/moe_training/test_fsdp.py       | 12 +++++++++---
 test/prototype/moe_training/test_fsdp.sh       |  2 +-
 test/prototype/moe_training/test_fsdp_tp.py    |  5 ++---
 test/prototype/moe_training/test_fsdp_tp.sh    |  2 +-
 test/prototype/moe_training/test_tp.py         |  5 ++---
 test/prototype/moe_training/test_tp.sh         |  2 +-
 7 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/test/prototype/moe_training/test_everything.sh b/test/prototype/moe_training/test_everything.sh
index 1a036cb7ea..d9164d4981 100755
--- a/test/prototype/moe_training/test_everything.sh
+++ b/test/prototype/moe_training/test_everything.sh
@@ -12,9 +12,9 @@ IS_ROCM=$(rocm-smi --version || true)
 # These tests do not work on ROCm yet
 if [ -z "$IS_ROCM" ]
 then
-./test/prototype/moe_training/test_fsdp.sh
-./test/prototype/moe_training/test_tp.sh
-./test/prototype/moe_training/test_fsdp_tp.sh
+./test_fsdp.sh
+./test_tp.sh
+./test_fsdp_tp.sh
 fi
 
 echo "all tests successful"
diff --git a/test/prototype/moe_training/test_fsdp.py b/test/prototype/moe_training/test_fsdp.py
index b205675527..b211a337a3 100644
--- a/test/prototype/moe_training/test_fsdp.py
+++ b/test/prototype/moe_training/test_fsdp.py
@@ -7,7 +7,7 @@
 #
 # To run these unit tests, use the following command:
 #
-# torchrun --nproc_per_node=${NUM_GPUS} -m pytest test_fsdp.py
+# torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test_fsdp.py
 #
 #######################################################################
 
@@ -45,7 +45,14 @@
 )
 
 
-def test_moe_float8_training_fsdp():
+@pytest.mark.parametrize(
+    "target_fqns",
+    [
+        ["experts"],
+        ["experts,shared_expert"],
+    ],
+)
+def test_moe_float8_training_fsdp(target_fqns: list[str]):
     assert torch.cuda.is_available()
 
     # setup distributed for fsdp
@@ -55,7 +62,6 @@ def test_moe_float8_training_fsdp():
     set_token_group_alignment_size_m(16)
 
     # define model args
-    target_fqns = ["experts"]
     model_args = MoEArgs(
         num_experts=8,
     )
diff --git a/test/prototype/moe_training/test_fsdp.sh b/test/prototype/moe_training/test_fsdp.sh
index 5f858061f4..3674209a1f 100755
--- a/test/prototype/moe_training/test_fsdp.sh
+++ b/test/prototype/moe_training/test_fsdp.sh
@@ -1 +1 @@
-torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test/prototype/moe_training/test_fsdp.py -s
+torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test_fsdp.py -s
diff --git a/test/prototype/moe_training/test_fsdp_tp.py b/test/prototype/moe_training/test_fsdp_tp.py
index 4a7c1356c0..59e09e1f49 100644
--- a/test/prototype/moe_training/test_fsdp_tp.py
+++ b/test/prototype/moe_training/test_fsdp_tp.py
@@ -7,7 +7,7 @@
 #
 # To run these unit tests, use the following command:
 #
-# torchrun --nproc_per_node=${NUM_GPUS} -m pytest test_fsdp_tp.py
+# torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test_fsdp_tp.py
 #
 #######################################################################
 
@@ -67,8 +67,7 @@
     "target_fqns",
     [
         ["experts"],
-        # TODO: investigate hang when shared_expert is converted
-        # ["experts,shared_expert"],
+        ["experts,shared_expert"],
     ],
 )
 def test_moe_float8_training_fsdp_tp(target_fqns: list[str]):
diff --git a/test/prototype/moe_training/test_fsdp_tp.sh b/test/prototype/moe_training/test_fsdp_tp.sh
index 4c00dcd853..24f64b6724 100755
--- a/test/prototype/moe_training/test_fsdp_tp.sh
+++ b/test/prototype/moe_training/test_fsdp_tp.sh
@@ -1 +1 @@
-torchrun --nproc_per_node=4 --local-ranks-filter=0 -m pytest test/prototype/moe_training/test_fsdp_tp.py -s
+torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test_fsdp_tp.py -s
diff --git a/test/prototype/moe_training/test_tp.py b/test/prototype/moe_training/test_tp.py
index bf913a69b3..26d98450d5 100644
--- a/test/prototype/moe_training/test_tp.py
+++ b/test/prototype/moe_training/test_tp.py
@@ -7,7 +7,7 @@
 #
 # To run these unit tests, use the following command:
 #
-# torchrun --nproc_per_node=${NUM_GPUS} -m pytest test_tp.py
+# torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test_tp.py
 #
 #######################################################################
 
@@ -67,8 +67,7 @@
     "target_fqns",
     [
         ["experts"],
-        # TODO: investigate hang when shared_expert is converted
-        # ["experts,shared_expert"],
+        ["experts,shared_expert"],
     ],
 )
 def test_moe_float8_training_tp(target_fqns: list[str]):
diff --git a/test/prototype/moe_training/test_tp.sh b/test/prototype/moe_training/test_tp.sh
index 2ab7636113..6c9efd4933 100755
--- a/test/prototype/moe_training/test_tp.sh
+++ b/test/prototype/moe_training/test_tp.sh
@@ -1 +1 @@
-torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test/prototype/moe_training/test_tp.py -s
+torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test_tp.py -s
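
Note on how the new target_fqns parameter is consumed: each parametrized entry (e.g. "experts,shared_expert") names the submodule FQN fragments to convert for float8 MoE training. Below is a minimal sketch of turning such an entry into a module filter; make_moe_filter_fn is a hypothetical helper name, and the quantize_/MoETrainingConfig usage in the trailing comment is an assumption about the torchao prototype conversion API, not a quote from the test files.

# Hypothetical sketch, not part of the patch: expand a parametrized
# target_fqns entry into a module filter for the MoE training conversion.
import torch.nn as nn


def make_moe_filter_fn(target_fqns: list[str]):
    # Split comma-joined entries like "experts,shared_expert" into individual
    # FQN fragments, then match each fragment against a module's FQN.
    fragments = [frag for entry in target_fqns for frag in entry.split(",")]

    def filter_fn(module: nn.Module, cur_fqn: str) -> bool:
        return any(frag in cur_fqn for frag in fragments)

    return filter_fn


# Assumed usage with the torchao prototype conversion API (verify against the
# actual test source):
#   from torchao.quantization import quantize_
#   from torchao.prototype.moe_training.conversion_utils import MoETrainingConfig
#   quantize_(model, config=MoETrainingConfig(),
#             filter_fn=make_moe_filter_fn(target_fqns))

With the second parametrization, the filter matches both the routed experts and the shared_expert module, which is the additional coverage this patch enables in the distributed tests.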