pytorch
diff --git a/benchmarks/prototype/moe_training/benchmark_2d_3d_grouped_gemms.py b/benchmarks/prototype/moe_training/benchmark_2d_3d_grouped_gemms.py
Lines changed: 36 additions & 2 deletions
diff --git a/benchmarks/prototype/moe_training/benchmark_moe_fsdp.py b/benchmarks/prototype/moe_training/benchmark_moe_layer_fsdp.py
rename from benchmarks/prototype/moe_training/benchmark_moe_fsdp.py
rename to benchmarks/prototype/moe_training/benchmark_moe_layer_fsdp.py
Lines changed: 24 additions & 13 deletions
diff --git a/benchmarks/prototype/moe_training/benchmark_scaled_grouped_mm_dq.py b/benchmarks/prototype/moe_training/benchmark_scaled_grouped_mm_dq.py
Lines changed: 71 additions & 17 deletions
diff --git a/benchmarks/utils.py b/benchmarks/utils.py
Lines changed: 15 additions & 2 deletions
diff --git a/test/prototype/moe_training/test_everything.sh b/test/prototype/moe_training/test_everything.sh
Lines changed: 2 additions & 0 deletions
@@ -6,6 +6,7 @@
 # this benchmarking script is a modified version of the original script from: https://github.com/drisspg/transformer_nuggets/blob/main/transformer_nuggets/utils/benchmark.py
 import argparse
 import itertools
+import logging
 from dataclasses import dataclass
 from typing import List
 
@@ -105,10 +106,22 @@ def run_experiment(
     )
 
     # bench fp8 rowwise grouped mm
-    fp8_rowwise_us = bench_fp8_rowwise_grouped_mm(A, B_t, offs)
+    if torch.cuda.get_device_capability() != (9, 0):
+        logging.warning(
+            f"Skipping FP8 rowwise benchmarks, only supported on compute capability 9.0 and found {torch.cuda.get_device_capability()}"
+        )
+        fp8_rowwise_us = float("inf")
+    else:
+        fp8_rowwise_us = bench_fp8_rowwise_grouped_mm(A, B_t, offs)
 
     # benchmark mxfp8 grouped mm
-    mxfp8_us = bench_mxfp8_grouped_mm(A, B_t, offs)
+    if torch.cuda.get_device_capability() != (10, 0):
+        logging.warning(
+            f"Skipping MXFP8 benchmarks, only supported on compute capability 10.0 and found {torch.cuda.get_device_capability()}"
+        )
+        mxfp8_us = float("inf")
+    else:
+        mxfp8_us = bench_mxfp8_grouped_mm(A, B_t, offs)
 
     return ExperimentResult(
         bf16_us=round(bf16_us, 3),
@@ -126,9 +139,25 @@ def print_results(experiments: List[Experiment]):
         "bf16_time_us",
         "fp8_rowwise_time_us",
         "mxfp8_time_us",
+        "bf16_tflops",
+        "fp8_rowwise_tflops",
+        "mxfp8_tflops",
+        "fp8_rowwise_speedup",
+        "mxfp8_speedup",
     ]
     rows = []
     for experiment in experiments:
+        # calculate tflops
+        e, m, n, k = (
+            experiment.config.e,
+            experiment.config.m,
+            experiment.config.n,
+            experiment.config.k,
+        )
+        flops = 2 * e * m * n * k
+        bf16_tflops = (flops / 1e12) / (experiment.result.bf16_us / 1e6)
+        fp8_rowwise_tflops = (flops / 1e12) / (experiment.result.fp8_rowwise_us / 1e6)
+        mxfp8_tflops = (flops / 1e12) / (experiment.result.mxfp8_us / 1e6)
         rows.append(
             [
                 experiment.config.e,
@@ -138,6 +167,11 @@ def print_results(experiments: List[Experiment]):
                 experiment.result.bf16_us,
                 experiment.result.fp8_rowwise_us,
                 experiment.result.mxfp8_us,
+                round(bf16_tflops, 3),
+                round(fp8_rowwise_tflops, 3),
+                round(mxfp8_tflops, 3),
+                f"{experiment.result.bf16_us / experiment.result.fp8_rowwise_us:.2f}x",
+                f"{experiment.result.bf16_us / experiment.result.mxfp8_us:.2f}x",
             ]
         )
     print(tabulate(rows, headers=headers))
 
@@ -7,12 +7,13 @@
 #
 # To run these benchmarks, use the following command:
 #
-# torchrun --nproc-per-node=8 --local-ranks-filter=0 torchao/prototype/moe_training/benchmarks/benchmark_moe_layer.py
+# torchrun --nproc-per-node=8 --local-ranks-filter=0 benchmarks/prototype/moe_training/benchmark_moe_layer_fsdp.py --recipe fp8_rowwise  # --recipe is required; the other accepted value is mxfp8
 #
 #######################################################################
 
 import argparse
 import copy
+import logging
 import os
 
 import pytest
@@ -23,13 +24,6 @@
 from torch.nn import functional as F
 
 from benchmarks.utils import bench_fwd_bwd_microseconds, profile_fwd_bwd
-
-# this feature requires CUDA and SM89+
-if not torch.cuda.is_available() or torch.cuda.get_device_capability() < (8, 9):
-    pytest.skip(
-        "CUDA not available or compute capability < 8.9", allow_module_level=True
-    )
-
 from torchao.prototype.moe_training.conversion_utils import (
     MoEScalingType,
     MoETrainingConfig,
@@ -48,12 +42,27 @@
     )
 
 
-def bench_moe_float8_training_fsdp(
-    recipe_name: str, enable_profile: bool, use_compile: bool
-):
+def bench_moe_training_fsdp(recipe_name: str, enable_profile: bool, use_compile: bool):
     assert torch.cuda.is_available()
     assert recipe_name in ["fp8_rowwise", "mxfp8"]
     recipe = MoEScalingType[recipe_name.upper()]
+    if recipe == MoEScalingType.FP8_ROWWISE and torch.cuda.get_device_capability() != (
+        9,
+        0,
+    ):
+        logging.warning(
+            f"Skipping FP8 rowwise benchmarks, only supported on compute capability 9.0 and found {torch.cuda.get_device_capability()}"
+        )
+        return
+
+    elif recipe == MoEScalingType.MXFP8 and torch.cuda.get_device_capability() != (
+        10,
+        0,
+    ):
+        logging.warning(
+            f"Skipping MXFP8 benchmarks, only supported on compute capability 10.0 and found {torch.cuda.get_device_capability()}"
+        )
+        return
 
     # setup distributed for fsdp
     setup_distributed()
@@ -157,14 +166,16 @@ def setup_distributed():
         action="store_true",
         help="Enable PyTorch profiling and save results to file",
     )
-    parser.add_argument("--recipe", type=str, help="[fp8_rowwise, mxfp8]")
+    parser.add_argument(
+        "--recipe", type=str, help="[fp8_rowwise, mxfp8]", required=True
+    )
     parser.add_argument(
         "--compile",
         action="store_true",
         help="use torch.compile",
     )
     args = parser.parse_args()
-    bench_moe_float8_training_fsdp(
+    bench_moe_training_fsdp(
         recipe_name=args.recipe,
         enable_profile=args.profile,
         use_compile=args.compile,
 
@@ -6,14 +6,19 @@
 # this benchmarking script is a modified version of the original script from: https://github.com/drisspg/transformer_nuggets/blob/main/transformer_nuggets/utils/benchmark.py
 import argparse
 import itertools
+import logging
 from dataclasses import dataclass
 from typing import List
 
 import torch
 from tabulate import tabulate
 from tqdm import tqdm
 
-from benchmarks.utils import bench_fwd_bwd_microseconds, profile_fwd_bwd
+from benchmarks.utils import (
+    bench_fwd_bwd_microseconds,
+    bench_fwd_microseconds,
+    profile_fwd_bwd,
+)
 from torchao.prototype.moe_training import _scaled_grouped_mm
 from torchao.prototype.moe_training.conversion_utils import MoEScalingType
 from torchao.prototype.moe_training.utils import generate_jagged_offs
@@ -34,9 +39,12 @@ class ExperimentConfig:
 
 @dataclass(frozen=True)
 class ExperimentResult:
-    bf16_us: float
-    scaled_us: float
-    scaled_speedup: float
+    bf16_e2e_us: float  # forward+backward time of torch._grouped_mm in microseconds, rounded to 3 places
+    scaled_e2e_us: float  # forward+backward time of _scaled_grouped_mm in microseconds, rounded to 3 places
+    scaled_e2e_speedup: float  # bf16_e2e_us / scaled_e2e_us, computed from the unrounded timings
+    bf16_fwd_us: float  # forward-only time of torch._grouped_mm in microseconds, rounded to 3 places
+    scaled_fwd_us: float  # forward-only time of _scaled_grouped_mm in microseconds, rounded to 3 places
+    scaled_fwd_speedup: float  # bf16_fwd_us / scaled_fwd_us, computed from the unrounded timings
 
 
 @dataclass(frozen=True)
@@ -100,8 +108,8 @@ def run_experiment(
         (A.shape[0], B_t.shape[-1]), device=device, dtype=torch.bfloat16
     )
 
-    # benchmark bf16 grouped mm
-    bf16_us = bench_fwd_bwd_microseconds(
+    # E2E bf16 benchmark + profiling
+    bf16_e2e_us = bench_fwd_bwd_microseconds(
         torch._grouped_mm,
         A,
         B_t,
@@ -122,8 +130,8 @@ def run_experiment(
             profile_name="bf16_profile",
         )
 
-    # benchmark scaled grouped mm with dynamic fp8 rowwise quant
-    scaled_us = bench_fwd_bwd_microseconds(
+    # E2E scaled benchmark + profiling
+    scaled_e2e_us = bench_fwd_bwd_microseconds(
         _scaled_grouped_mm,
         A,
         B_t,
@@ -146,10 +154,32 @@ def run_experiment(
             fullgraph=False,
         )
 
+    # Forward pass benchmarks
+    bf16_fwd_us = bench_fwd_microseconds(
+        torch._grouped_mm,
+        A,
+        B_t,
+        offs,
+        use_compile=args.compile,
+        fullgraph=True,
+    )
+    scaled_fwd_us = bench_fwd_microseconds(
+        _scaled_grouped_mm,
+        A,
+        B_t,
+        offs,
+        scaling_type=config.recipe,
+        use_compile=args.compile,
+        fullgraph=True,
+    )
+
     return ExperimentResult(
-        bf16_us=round(bf16_us, 3),
-        scaled_us=round(scaled_us, 3),
-        scaled_speedup=round(bf16_us / scaled_us, 3),
+        bf16_e2e_us=round(bf16_e2e_us, 3),
+        scaled_e2e_us=round(scaled_e2e_us, 3),
+        scaled_e2e_speedup=round(bf16_e2e_us / scaled_e2e_us, 3),
+        bf16_fwd_us=round(bf16_fwd_us, 3),
+        scaled_fwd_us=round(scaled_fwd_us, 3),
+        scaled_fwd_speedup=round(bf16_fwd_us / scaled_fwd_us, 3),
     )
 
 
@@ -158,9 +188,12 @@ def print_results(experiments: List[Experiment]):
         "A_shape",
         "B_shape",
         "recipe",
-        "bf16_time_us",
-        "scaled_time_us",
-        "scaled_speedup",
+        "bf16_e2e_us",
+        "scaled_e2e_us",
+        "scaled_e2e_speedup",
+        "bf16_fwd_us",
+        "scaled_fwd_us",
+        "scaled_fwd_speedup",
     ]
     rows = []
     for experiment in experiments:
@@ -171,9 +204,12 @@ def print_results(experiments: List[Experiment]):
                 A_shape,
                 B_shape,
                 experiment.config.recipe,
-                experiment.result.bf16_us,
-                experiment.result.scaled_us,
-                f"{experiment.result.scaled_speedup}x",
+                experiment.result.bf16_e2e_us,
+                experiment.result.scaled_e2e_us,
+                f"{experiment.result.scaled_e2e_speedup}x",
+                experiment.result.bf16_fwd_us,
+                experiment.result.scaled_fwd_us,
+                f"{experiment.result.scaled_fwd_speedup}x",
             ]
         )
     print(tabulate(rows, headers=headers))
@@ -184,6 +220,24 @@ def main(args: argparse.Namespace):
     configs = get_configs()
     results = []
     for config in tqdm(configs):
+        if (
+            config.recipe == MoEScalingType.FP8_ROWWISE
+            and torch.cuda.get_device_capability() != (9, 0)
+        ):
+            logging.warning(
+                f"Skipping FP8 rowwise benchmarks, only supported on compute capability 9.0 and found {torch.cuda.get_device_capability()}"
+            )
+            continue
+
+        elif (
+            config.recipe == MoEScalingType.MXFP8
+            and torch.cuda.get_device_capability() != (10, 0)
+        ):
+            logging.warning(
+                f"Skipping MXFP8 benchmarks, only supported on compute capability 10.0 and found {torch.cuda.get_device_capability()}"
+            )
+            continue
+
         result = run_experiment(config, args)
         results.append(Experiment(config=config, result=result))
 
 
@@ -8,15 +8,28 @@ def bench_fwd_bwd_microseconds(
 ):
     assert labels is not None
 
-    def fwd_bwd():
+    def fwd_bwd(*args, **kwargs):
         out = fn(*args, **kwargs)
         loss = F.mse_loss(out, labels)
         loss.backward()
 
     fwd_bwd_compiled = (
         torch.compile(fwd_bwd, fullgraph=fullgraph) if use_compile else fwd_bwd
     )
-    return benchmark_cuda_function_in_microseconds(fwd_bwd_compiled)
+    return benchmark_cuda_function_in_microseconds(
+        fwd_bwd_compiled,
+        *args,
+        **kwargs,
+    )
+
+
+def bench_fwd_microseconds(fn, *args, use_compile=False, fullgraph=True, **kwargs):  # forward-only counterpart of bench_fwd_bwd_microseconds: no labels, loss or backward()
+    fn_compiled = torch.compile(fn, fullgraph=fullgraph) if use_compile else fn  # fn is used as-is unless use_compile is set
+    return benchmark_cuda_function_in_microseconds(  # timing helper defined outside this hunk; its result is returned unchanged
+        fn_compiled,
+        *args,  # forwarded as given; use_compile and fullgraph are consumed above and not passed on
+        **kwargs,
+    )
 
 
 def profile_fwd_bwd(
 
@@ -12,6 +12,8 @@ IS_ROCM=$(rocm-smi --version || true)
 # These tests do not work on ROCm yet
 if [ -z "$IS_ROCM" ]
 then
+pytest test/prototype/moe_training/test_kernels.py -s
+pytest test/prototype/moe_training/test_training.py -s
 ./test/prototype/moe_training/test_fsdp.sh
 ./test/prototype/moe_training/test_tp.sh
 ./test/prototype/moe_training/test_fsdp_tp.sh