From 45fb201a634b1ea06d825655f09483c8a571a488 Mon Sep 17 00:00:00 2001
From: Daniel Vega-Myhre
Date: Fri, 22 Aug 2025 15:20:48 -0700
Subject: [PATCH] [moe training] add test case for shared expert in distributed tests

stack-info: PR: https://github.com/pytorch/ao/pull/2856, branch: danielvegamyhre/stack/56
---
 test/prototype/moe_training/test_everything.sh |  6 +++---
 test/prototype/moe_training/test_fsdp.py       | 12 +++++++++---
 test/prototype/moe_training/test_fsdp.sh       |  2 +-
 test/prototype/moe_training/test_fsdp_tp.py    |  5 ++---
 test/prototype/moe_training/test_fsdp_tp.sh    |  2 +-
 test/prototype/moe_training/test_tp.py         |  5 ++---
 test/prototype/moe_training/test_tp.sh         |  2 +-
 7 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/test/prototype/moe_training/test_everything.sh b/test/prototype/moe_training/test_everything.sh
index 1a036cb7ea..d9164d4981 100755
--- a/test/prototype/moe_training/test_everything.sh
+++ b/test/prototype/moe_training/test_everything.sh
@@ -12,9 +12,9 @@ IS_ROCM=$(rocm-smi --version || true)
 # These tests do not work on ROCm yet
 if [ -z "$IS_ROCM" ]
 then
-./test/prototype/moe_training/test_fsdp.sh
-./test/prototype/moe_training/test_tp.sh
-./test/prototype/moe_training/test_fsdp_tp.sh
+./test_fsdp.sh
+./test_tp.sh
+./test_fsdp_tp.sh
 fi
 
 echo "all tests successful"
diff --git a/test/prototype/moe_training/test_fsdp.py b/test/prototype/moe_training/test_fsdp.py
index b205675527..b211a337a3 100644
--- a/test/prototype/moe_training/test_fsdp.py
+++ b/test/prototype/moe_training/test_fsdp.py
@@ -7,7 +7,7 @@
 #
 # To run these unit tests, use the following command:
 #
-# torchrun --nproc_per_node=${NUM_GPUS} -m pytest test_fsdp.py
+# torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test_fsdp.py
 #
 #######################################################################
 
@@ -45,7 +45,14 @@
 )
 
 
-def test_moe_float8_training_fsdp():
+@pytest.mark.parametrize(
+    "target_fqns",
+    [
+        ["experts"],
+        ["experts,shared_expert"],
+    ],
+)
+def test_moe_float8_training_fsdp(target_fqns: list[str]):
     assert torch.cuda.is_available()
 
     # setup distributed for fsdp
@@ -55,7 +62,6 @@ def test_moe_float8_training_fsdp():
     set_token_group_alignment_size_m(16)
 
     # define model args
-    target_fqns = ["experts"]
     model_args = MoEArgs(
         num_experts=8,
     )
diff --git a/test/prototype/moe_training/test_fsdp.sh b/test/prototype/moe_training/test_fsdp.sh
index 5f858061f4..3674209a1f 100755
--- a/test/prototype/moe_training/test_fsdp.sh
+++ b/test/prototype/moe_training/test_fsdp.sh
@@ -1 +1 @@
-torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test/prototype/moe_training/test_fsdp.py -s
+torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test_fsdp.py -s
diff --git a/test/prototype/moe_training/test_fsdp_tp.py b/test/prototype/moe_training/test_fsdp_tp.py
index 4a7c1356c0..59e09e1f49 100644
--- a/test/prototype/moe_training/test_fsdp_tp.py
+++ b/test/prototype/moe_training/test_fsdp_tp.py
@@ -7,7 +7,7 @@
 #
 # To run these unit tests, use the following command:
 #
-# torchrun --nproc_per_node=${NUM_GPUS} -m pytest test_fsdp_tp.py
+# torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test_fsdp_tp.py
 #
 #######################################################################
 
@@ -67,8 +67,7 @@
     "target_fqns",
     [
         ["experts"],
-        # TODO: investigate hang when shared_expert is converted
-        # ["experts,shared_expert"],
+        ["experts,shared_expert"],
     ],
 )
 def test_moe_float8_training_fsdp_tp(target_fqns: list[str]):
diff --git a/test/prototype/moe_training/test_fsdp_tp.sh b/test/prototype/moe_training/test_fsdp_tp.sh
index 4c00dcd853..24f64b6724 100755
--- a/test/prototype/moe_training/test_fsdp_tp.sh
+++ b/test/prototype/moe_training/test_fsdp_tp.sh
@@ -1 +1 @@
-torchrun --nproc_per_node=4 --local-ranks-filter=0 -m pytest test/prototype/moe_training/test_fsdp_tp.py -s
+torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test_fsdp_tp.py -s
diff --git a/test/prototype/moe_training/test_tp.py b/test/prototype/moe_training/test_tp.py
index bf913a69b3..26d98450d5 100644
--- a/test/prototype/moe_training/test_tp.py
+++ b/test/prototype/moe_training/test_tp.py
@@ -7,7 +7,7 @@
 #
 # To run these unit tests, use the following command:
 #
-# torchrun --nproc_per_node=${NUM_GPUS} -m pytest test_tp.py
+# torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test_tp.py
 #
 #######################################################################
 
@@ -67,8 +67,7 @@
     "target_fqns",
     [
         ["experts"],
-        # TODO: investigate hang when shared_expert is converted
-        # ["experts,shared_expert"],
+        ["experts,shared_expert"],
     ],
 )
 def test_moe_float8_training_tp(target_fqns: list[str]):
diff --git a/test/prototype/moe_training/test_tp.sh b/test/prototype/moe_training/test_tp.sh
index 2ab7636113..6c9efd4933 100755
--- a/test/prototype/moe_training/test_tp.sh
+++ b/test/prototype/moe_training/test_tp.sh
@@ -1 +1 @@
-torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test/prototype/moe_training/test_tp.py -s
+torchrun --nproc_per_node=2 --local-ranks-filter=0 -m pytest test_tp.py -s
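
Note on how the new target_fqns parameter is consumed: each parametrized entry (e.g. "experts,shared_expert") names the submodule FQN fragments to convert for float8 MoE training. Below is a minimal sketch of turning such an entry into a module filter; make_moe_filter_fn is a hypothetical helper name, and the quantize_/MoETrainingConfig usage in the trailing comment is an assumption about the torchao prototype conversion API, not a quote from the test files.

# Hypothetical sketch, not part of the patch: expand a parametrized
# target_fqns entry into a module filter for the MoE training conversion.
import torch.nn as nn


def make_moe_filter_fn(target_fqns: list[str]):
    # Split comma-joined entries like "experts,shared_expert" into individual
    # FQN fragments, then match each fragment against a module's FQN.
    fragments = [frag for entry in target_fqns for frag in entry.split(",")]

    def filter_fn(module: nn.Module, cur_fqn: str) -> bool:
        return any(frag in cur_fqn for frag in fragments)

    return filter_fn


# Assumed usage with the torchao prototype conversion API (verify against the
# actual test source):
#   from torchao.quantization import quantize_
#   from torchao.prototype.moe_training.conversion_utils import MoETrainingConfig
#   quantize_(model, config=MoETrainingConfig(),
#             filter_fn=make_moe_filter_fn(target_fqns))

With the second parametrization, the filter matches both the routed experts and the shared_expert module, which is the additional coverage this patch enables in the distributed tests.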