7 files changed, +19 −15 lines (test/prototype/moe_training).

Test runner script:

@@ -12,9 +12,9 @@ IS_ROCM=$(rocm-smi --version || true)
 # These tests do not work on ROCm yet
 if [ -z "$IS_ROCM" ]
 then
-    ./test/prototype/moe_training/test_fsdp.sh
-    ./test/prototype/moe_training/test_tp.sh
-    ./test/prototype/moe_training/test_fsdp_tp.sh
+    ./test_fsdp.sh
+    ./test_tp.sh
+    ./test_fsdp_tp.sh
 fi

 echo "all tests successful"
test/prototype/moe_training/test_fsdp.py:

@@ -7,7 +7,7 @@
 #
 # To run these unit tests, use the following command:
 #
-# torchrun --nproc_per_node=${NUM_GPUS} -m pytest test_fsdp.py
+# torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test_fsdp.py
 #
 #######################################################################
@@ -45,7 +45,14 @@
 )


-def test_moe_float8_training_fsdp():
+@pytest.mark.parametrize(
+    "target_fqns",
+    [
+        ["experts"],
+        ["experts,shared_expert"],
+    ],
+)
+def test_moe_float8_training_fsdp(target_fqns: list[str]):
     assert torch.cuda.is_available()

     # setup distributed for fsdp
@@ -55,7 +62,6 @@ def test_moe_float8_training_fsdp():
     set_token_group_alignment_size_m(16)

     # define model args
-    target_fqns = ["experts"]
     model_args = MoEArgs(
         num_experts=8,
     )
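The FSDP test is now parametrized over target_fqns instead of hard-coding ["experts"] (the TP and FSDP+TP tests below gain the same "experts,shared_expert" case). Presumably target_fqns selects which submodules get converted for float8 MoE training; below is a minimal sketch of a name-based filter that could consume these values (the function name, signature, and comma-splitting behavior are assumptions for illustration, not the actual torchao API).

import torch.nn as nn

# Hypothetical sketch, not the actual torchao API: decide whether a module,
# identified by its fully-qualified name (fqn), should be converted for
# float8 MoE training, based on the parametrized target_fqns.
def module_filter_fn(module: nn.Module, fqn: str, target_fqns: list[str]) -> bool:
    # An entry may itself be comma-joined (e.g. "experts,shared_expert"),
    # so split each entry before substring-matching against the fqn.
    targets = [name for entry in target_fqns for name in entry.split(",")]
    return any(name in fqn for name in targets)

# Example: with target_fqns=["experts,shared_expert"], a module whose fqn is
# "layers.0.moe.shared_expert.w1" would be selected for conversion.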
test/prototype/moe_training/test_fsdp.sh:

@@ -1 +1 @@
-torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test/prototype/moe_training/test_fsdp.py -s
+torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test_fsdp.py -s
test/prototype/moe_training/test_fsdp_tp.py:

@@ -7,7 +7,7 @@
 #
 # To run these unit tests, use the following command:
 #
-# torchrun --nproc_per_node=${NUM_GPUS} -m pytest test_fsdp_tp.py
+# torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test_fsdp_tp.py
 #
 #######################################################################
6767 "target_fqns" ,
6868 [
6969 ["experts" ],
70- # TODO: investigate hang when shared_expert is converted
71- # ["experts,shared_expert"],
70+ ["experts,shared_expert" ],
7271 ],
7372)
7473def test_moe_float8_training_fsdp_tp (target_fqns : list [str ]):
test/prototype/moe_training/test_fsdp_tp.sh:

@@ -1 +1 @@
-torchrun --nproc_per_node=4 --local-ranks-filter=0 -m pytest test/prototype/moe_training/test_fsdp_tp.py -s
+torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test_fsdp_tp.py -s
test/prototype/moe_training/test_tp.py:

@@ -7,7 +7,7 @@
 #
 # To run these unit tests, use the following command:
 #
-# torchrun --nproc_per_node=${NUM_GPUS} -m pytest test_tp.py
+# torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test_tp.py
 #
 #######################################################################
6767 "target_fqns" ,
6868 [
6969 ["experts" ],
70- # TODO: investigate hang when shared_expert is converted
71- # ["experts,shared_expert"],
70+ ["experts,shared_expert" ],
7271 ],
7372)
7473def test_moe_float8_training_tp (target_fqns : list [str ]):
test/prototype/moe_training/test_tp.sh:

@@ -1 +1 @@
-torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test/prototype/moe_training/test_tp.py -s
+torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test_tp.py -s