Commit 48fb79d

[moe training] update tests + benchmarks with conditional runs based on SM arch; make test cases more comprehensive and consistent
Parent: 843448d

8 files changed: +261 -57 lines changed

benchmarks/prototype/moe_training/benchmark_2d_3d_grouped_gemms.py

Lines changed: 36 additions & 2 deletions
@@ -6,6 +6,7 @@
 # this benchmarking script is a modified version of the original script from: https://github.com/drisspg/transformer_nuggets/blob/main/transformer_nuggets/utils/benchmark.py
 import argparse
 import itertools
+import logging
 from dataclasses import dataclass
 from typing import List
 
@@ -105,10 +106,22 @@ def run_experiment(
     )
 
     # bench fp8 rowwise grouped mm
-    fp8_rowwise_us = bench_fp8_rowwise_grouped_mm(A, B_t, offs)
+    if torch.cuda.get_device_capability() != (9, 0):
+        logging.warning(
+            f"Skipping FP8 rowwise benchmarks, only supported on compute capability 9.0 and found {torch.cuda.get_device_capability()}"
+        )
+        fp8_rowwise_us = float("inf")
+    else:
+        fp8_rowwise_us = bench_fp8_rowwise_grouped_mm(A, B_t, offs)
 
     # benchmark mxfp8 grouped mm
-    mxfp8_us = bench_mxfp8_grouped_mm(A, B_t, offs)
+    if torch.cuda.get_device_capability() != (10, 0):
+        logging.warning(
+            f"Skipping MXFP8 benchmarks, only supported on compute capability 10.0 and found {torch.cuda.get_device_capability()}"
+        )
+        mxfp8_us = float("inf")
+    else:
+        mxfp8_us = bench_mxfp8_grouped_mm(A, B_t, offs)
 
     return ExperimentResult(
         bf16_us=round(bf16_us, 3),
@@ -126,9 +139,25 @@ def print_results(experiments: List[Experiment]):
         "bf16_time_us",
         "fp8_rowwise_time_us",
         "mxfp8_time_us",
+        "bf16_tflops",
+        "fp8_rowwise_tflops",
+        "mxfp8_tflops",
+        "fp8_rowwise_speedup",
+        "mxfp8_speedup",
     ]
     rows = []
     for experiment in experiments:
+        # calculate tflops
+        e, m, n, k = (
+            experiment.config.e,
+            experiment.config.m,
+            experiment.config.n,
+            experiment.config.k,
+        )
+        flops = 2 * e * m * n * k
+        bf16_tflops = (flops / 1e12) / (experiment.result.bf16_us / 1e6)
+        fp8_rowwise_tflops = (flops / 1e12) / (experiment.result.fp8_rowwise_us / 1e6)
+        mxfp8_tflops = (flops / 1e12) / (experiment.result.mxfp8_us / 1e6)
         rows.append(
             [
                 experiment.config.e,
@@ -138,6 +167,11 @@ def print_results(experiments: List[Experiment]):
                 experiment.result.bf16_us,
                 experiment.result.fp8_rowwise_us,
                 experiment.result.mxfp8_us,
+                round(bf16_tflops, 3),
+                round(fp8_rowwise_tflops, 3),
+                round(mxfp8_tflops, 3),
+                f"{experiment.result.bf16_us / experiment.result.fp8_rowwise_us:.2f}x",
+                f"{experiment.result.bf16_us / experiment.result.mxfp8_us:.2f}x",
             ]
         )
     print(tabulate(rows, headers=headers))
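
Note on the new TFLOPS columns: a grouped GEMM over e experts with per-expert problem size m x n x k performs 2 * e * m * n * k floating-point ops, and the measured time is converted from microseconds to seconds before dividing. A minimal standalone sketch of the same arithmetic, with made-up shapes and timing for illustration:

    # Illustrative only: mirrors the TFLOPS math added to print_results above.
    e, m, n, k = 8, 1024, 8192, 5120  # hypothetical grouped GEMM problem size
    bench_us = 1500.0                 # hypothetical measured latency in microseconds
    flops = 2 * e * m * n * k         # 2 ops (multiply + add) per MAC, e independent GEMMs
    tflops = (flops / 1e12) / (bench_us / 1e6)  # TFLOP divided by seconds
    print(f"{tflops:.3f} TFLOPS")     # ~458 TFLOPS for these illustrative values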

benchmarks/prototype/moe_training/benchmark_moe_fsdp.py renamed to benchmarks/prototype/moe_training/benchmark_moe_layer_fsdp.py

Lines changed: 24 additions & 13 deletions
@@ -7,12 +7,13 @@
 #
 # To run these benchmarks, use the following command:
 #
-# torchrun --nproc-per-node=8 --local-ranks-filter=0 torchao/prototype/moe_training/benchmarks/benchmark_moe_layer.py
+# torchrun --nproc-per-node=8 --local-ranks-filter=0 benchmarks/prototype/moe_training/benchmark_moe_layer_fsdp.py
 #
 #######################################################################
 
 import argparse
 import copy
+import logging
 import os
 
 import pytest
@@ -23,13 +24,6 @@
 from torch.nn import functional as F
 
 from benchmarks.utils import bench_fwd_bwd_microseconds, profile_fwd_bwd
-
-# this feature requires CUDA and SM89+
-if not torch.cuda.is_available() or torch.cuda.get_device_capability() < (8, 9):
-    pytest.skip(
-        "CUDA not available or compute capability < 8.9", allow_module_level=True
-    )
-
 from torchao.prototype.moe_training.conversion_utils import (
     MoEScalingType,
     MoETrainingConfig,
@@ -48,12 +42,27 @@
 )
 
 
-def bench_moe_float8_training_fsdp(
-    recipe_name: str, enable_profile: bool, use_compile: bool
-):
+def bench_moe_training_fsdp(recipe_name: str, enable_profile: bool, use_compile: bool):
     assert torch.cuda.is_available()
     assert recipe_name in ["fp8_rowwise", "mxfp8"]
     recipe = MoEScalingType[recipe_name.upper()]
+    if recipe == MoEScalingType.FP8_ROWWISE and torch.cuda.get_device_capability() != (
+        9,
+        0,
+    ):
+        logging.warning(
+            f"Skipping FP8 rowwise benchmarks, only supported on compute capability 9.0 and found {torch.cuda.get_device_capability()}"
+        )
+        return
+
+    elif recipe == MoEScalingType.MXFP8 and torch.cuda.get_device_capability() != (
+        10,
+        0,
+    ):
+        logging.warning(
+            f"Skipping MXFP8 benchmarks, only supported on compute capability 10.0 and found {torch.cuda.get_device_capability()}"
+        )
+        return
 
     # setup distributed for fsdp
     setup_distributed()
@@ -157,14 +166,16 @@ def setup_distributed():
         action="store_true",
         help="Enable PyTorch profiling and save results to file",
     )
-    parser.add_argument("--recipe", type=str, help="[fp8_rowwise, mxfp8]")
+    parser.add_argument(
+        "--recipe", type=str, help="[fp8_rowwise, mxfp8]", required=True
+    )
    parser.add_argument(
        "--compile",
        action="store_true",
        help="use torch.compile",
    )
    args = parser.parse_args()
-    bench_moe_float8_training_fsdp(
+    bench_moe_training_fsdp(
        recipe_name=args.recipe,
        enable_profile=args.profile,
        use_compile=args.compile,
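
Usage note: with --recipe now required, a typical run of the renamed script is torchrun --nproc-per-node=8 --local-ranks-filter=0 benchmarks/prototype/moe_training/benchmark_moe_layer_fsdp.py --recipe mxfp8 (or --recipe fp8_rowwise), with --compile and --profile as optional flags.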

benchmarks/prototype/moe_training/benchmark_scaled_grouped_mm_dq.py

Lines changed: 19 additions & 0 deletions
@@ -6,6 +6,7 @@
 # this benchmarking script is a modified version of the original script from: https://github.com/drisspg/transformer_nuggets/blob/main/transformer_nuggets/utils/benchmark.py
 import argparse
 import itertools
+import logging
 from dataclasses import dataclass
 from typing import List
 
@@ -184,6 +185,24 @@ def main(args: argparse.Namespace):
     configs = get_configs()
     results = []
     for config in tqdm(configs):
+        if (
+            config.recipe == MoEScalingType.FP8_ROWWISE
+            and torch.cuda.get_device_capability() != (9, 0)
+        ):
+            logging.warning(
+                f"Skipping FP8 rowwise benchmarks, only supported on compute capability 9.0 and found {torch.cuda.get_device_capability()}"
+            )
+            continue
+
+        elif (
+            config.recipe == MoEScalingType.MXFP8
+            and torch.cuda.get_device_capability() != (10, 0)
+        ):
+            logging.warning(
+                f"Skipping MXFP8 benchmarks, only supported on compute capability 10.0 and found {torch.cuda.get_device_capability()}"
+            )
+            continue
+
         result = run_experiment(config, args)
         results.append(Experiment(config=config, result=result))

test/prototype/moe_training/test_everything.sh

Lines changed: 2 additions & 0 deletions
@@ -12,6 +12,8 @@ IS_ROCM=$(rocm-smi --version || true)
 # These tests do not work on ROCm yet
 if [ -z "$IS_ROCM" ]
 then
+pytest test/prototype/moe_training/test_kernels.py -s
+pytest test/prototype/moe_training/test_training.py -s
 ./test/prototype/moe_training/test_fsdp.sh
 ./test/prototype/moe_training/test_tp.sh
 ./test/prototype/moe_training/test_fsdp_tp.sh

test/prototype/moe_training/test_fsdp.py

Lines changed: 57 additions & 14 deletions
@@ -34,15 +34,15 @@
     "CUDA not available or compute capability < 8.9", allow_module_level=True
 )
 
-from testing_utils import _validate_model_conversion
-
 from torchao.float8.float8_utils import compute_error
 from torchao.prototype.moe_training.conversion_utils import (
     MoEScalingType,
     MoETrainingConfig,
 )
 from torchao.quantization.quant_api import quantize_
 
+from .testing_utils import _validate_model_conversion
+
 # this test requires torchtitan
 try:
     from torchtitan.distributed.expert_parallel import set_token_group_alignment_size_m
@@ -54,27 +54,71 @@
 
 
 @pytest.mark.parametrize(
-    "recipe, min_out_sqnr, alignment_size, min_param_grad_sqnr",
+    "target_fqns",
+    [
+        ["experts"],
+        ["does.not.exist"],
+    ],
+)
+@pytest.mark.parametrize("compile", [False, True])
+@pytest.mark.parametrize(
+    "recipe_config",
     [
-        (MoEScalingType.FP8_ROWWISE, 29.0, 16, 23.0),
-        (MoEScalingType.MXFP8, 28.0, 32, 21.0),
+        {
+            "recipe": MoEScalingType.FP8_ROWWISE,
+            "group_alignment_size": 16,
+            "min_out_sqnr": 29.0,
+            "min_input_grad_sqnr": 29.0,
+            "min_param_grad_sqnr": 23.0,
+        },
+        {
+            "recipe": MoEScalingType.MXFP8,
+            "group_alignment_size": 32,
+            "min_out_sqnr": 28.0,
+            "min_input_grad_sqnr": 29.0,
+            "min_param_grad_sqnr": 21.0,
+        },
     ],
 )
-def test_moe_float8_training_fsdp(
-    recipe: MoEScalingType,
-    min_out_sqnr: float,
-    alignment_size: int,
-    min_param_grad_sqnr: float,
-):
+def test_moe_training_fsdp(target_fqns: list[str], compile: bool, recipe_config: dict):
+    (
+        recipe,
+        group_alignment_size,
+        min_out_sqnr,
+        min_input_grad_sqnr,
+        min_param_grad_sqnr,
+    ) = (
+        recipe_config["recipe"],
+        recipe_config["group_alignment_size"],
+        recipe_config["min_out_sqnr"],
+        recipe_config["min_input_grad_sqnr"],
+        recipe_config["min_param_grad_sqnr"],
+    )
     assert torch.cuda.is_available()
+    if recipe == MoEScalingType.FP8_ROWWISE and torch.cuda.get_device_capability() != (
+        9,
+        0,
+    ):
+        pytest.skip(
+            f"Skipping FP8 rowwise tests, only supported on compute capability 9.0 and found {torch.cuda.get_device_capability()}"
+        )
+
+    elif recipe == MoEScalingType.MXFP8 and torch.cuda.get_device_capability() != (
+        10,
+        0,
+    ):
+        pytest.skip(
+            f"Skipping MXFP8 tests, only supported on compute capability 10.0 and found {torch.cuda.get_device_capability()}"
+        )
 
     # setup distributed for fsdp
     setup_distributed()
 
-    set_token_group_alignment_size_m(alignment_size)
+    # set token group alignment size needed for GEMM (contraction dim stride must be 16 byte aligned)
+    # or quantization ops (mxfp8 scaling groups are size 1x32)
+    set_token_group_alignment_size_m(group_alignment_size)
 
     # define model args
-    target_fqns = ["experts"]
     model_args = MoEArgs(
         num_experts=8,
     )
@@ -143,7 +187,6 @@ def moe_module_filter_fn(mod: nn.Module, cur_fqn: str) -> bool:
 
     # validate input gradient
     input_grad_sqnr = compute_error(x.grad, ref_x.grad)
-    min_input_grad_sqnr = 29.0
     assert input_grad_sqnr.item() >= min_input_grad_sqnr, (
         f"SQNR must be >= {min_input_grad_sqnr}, got {input_grad_sqnr.item()}."
    )
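
The SQNR thresholds above are checked against compute_error from torchao.float8.float8_utils, which reports signal-to-quantization-noise ratio in decibels. A minimal sketch of the metric, assuming the standard 20 * log10 norm-ratio definition:

    import torch

    # Sketch of the SQNR metric the thresholds above refer to, assuming the
    # standard 20*log10(||ref|| / ||ref - test||) definition in dB.
    def sqnr_db(ref: torch.Tensor, test: torch.Tensor) -> torch.Tensor:
        signal = torch.norm(ref)        # magnitude of the reference signal
        noise = torch.norm(ref - test)  # magnitude of the quantization error
        return 20 * torch.log10(signal / noise)

    # e.g. the MXFP8 case above requires sqnr_db(ref_out, quantized_out) >= 28.0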
