Commit a5a29db (parent: b663faf)

[mxfp8 moe] add support for fbgemm 2d-3d mx8mx8bf16 grouped gemm

File tree: 5 files changed, +105 / -15 lines changed

test/prototype/moe_training/test_scaled_grouped_mm.py

Lines changed: 17 additions & 10 deletions
@@ -230,25 +230,27 @@ def compute_reference_forward(
 @pytest.mark.parametrize("num_experts", (1, 8, 16))
 def test_emulate_mxfp8_grouped_gemm_2d_3d(M, K, N, num_experts):
     x = torch.randn(M, K, dtype=torch.bfloat16, device="cuda")
-    w_t = torch.randn(num_experts, K, N, dtype=torch.bfloat16, device="cuda")
+    w = torch.randn(num_experts, N, K, dtype=torch.bfloat16, device="cuda")
     offs = generate_jagged_offs(num_experts, M)
-    x_ref, w_t_ref, offs_ref = x.clone(), w_t.clone(), offs.clone()
+    x_ref, w_ref, offs_ref = x.clone(), w.clone(), offs.clone()

     # Quantize inputs to mxpf8 for emulated mxfp8 scaled grouped mm
     block_size = 32
-    x_scale, x_mx = to_mx(x, elem_dtype=torch.float8_e4m3fn, block_size=block_size)
+    x_scale, x_fp8 = to_mx(x, elem_dtype=torch.float8_e4m3fn, block_size=block_size)

     # To cast B_t per-expert to mxfp8 across dim1, we transpose the experts, cast along dim -1, then untranspose.
-    w_scale, w_mx = to_mx(
-        w_t.transpose(-2, -1).contiguous(),
+    w_scale, w_fp8 = to_mx(
+        w,
         elem_dtype=torch.float8_e4m3fn,
         block_size=block_size,
     )
-    w_t_scale, w_t_mx = w_scale.transpose(-2, -1), w_mx.transpose(-2, -1)
+    w_t_scale, w_t_fp8 = w_scale.transpose(-2, -1), w_fp8.transpose(-2, -1)

-    ref_out = torch._grouped_mm(x_ref, w_t_ref, offs=offs_ref, out_dtype=torch.bfloat16)
+    ref_out = torch._grouped_mm(
+        x_ref, w_ref.transpose(-2, -1), offs=offs_ref, out_dtype=torch.bfloat16
+    )
     out = _emulated_mxfp8_scaled_grouped_mm_2d_3d(
-        x_mx, x_scale, w_t_mx, w_t_scale, offs=offs, out_dtype=torch.bfloat16
+        x_fp8, x_scale, w_t_fp8, w_t_scale, offs=offs, out_dtype=torch.bfloat16
     )

     sqnr = compute_error(ref_out, out)

@@ -314,9 +316,14 @@ def test_mxfp8_grouped_gemm_with_dq_fwd_bwd(M, K, N, num_experts):

     block_size = 32
     x = torch.randn(M, K, dtype=torch.bfloat16, device="cuda", requires_grad=True)
-    w_t = torch.randn(
-        num_experts, K, N, dtype=torch.bfloat16, device="cuda", requires_grad=True
+    w = torch.randn(
+        num_experts,
+        N,
+        K,
+        dtype=torch.bfloat16,
+        device="cuda",
     )
+    w_t = w.transpose(-2, -1).requires_grad_(True)
     offs = generate_jagged_offs(num_experts, M, multiple_of=block_size)
     x_ref, w_t_ref, offs_ref = (
         x.clone().detach().requires_grad_(True),
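The updated tests store each expert weight as (E, N, K), cast it with to_mx along the last (contiguous) dim, then transpose data and scales to recover the dim1-cast (E, K, N) operands the grouped GEMM expects, as the test comment describes. A minimal sketch of that pattern, not part of the commit, assuming to_mx is importable from torchao.prototype.mx_formats.mx_tensor and a CUDA device is available:

import torch
from torchao.prototype.mx_formats.mx_tensor import to_mx  # assumed import path

E, N, K, block_size = 2, 32, 64, 32
w = torch.randn(E, N, K, dtype=torch.bfloat16, device="cuda")

# Cast along the last dim (K), which is contiguous for the (E, N, K) layout.
# to_mx returns (blockwise e8m0 scales, fp8 data), matching the usage in the test.
w_scale, w_fp8 = to_mx(w, elem_dtype=torch.float8_e4m3fn, block_size=block_size)

# Transpose data and scales to get the "cast along dim1" B_t operands.
w_t_fp8 = w_fp8.transpose(-2, -1)      # (E, K, N)
w_t_scale = w_scale.transpose(-2, -1)  # expected (E, K//block_size, N), matching B_t_scales_dim1 below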

test/prototype/moe_training/test_training.py

Lines changed: 1 addition & 1 deletion
@@ -136,7 +136,7 @@ def moe_module_filter_fn(mod: nn.Module, cur_fqn: str) -> bool:
         ["does.not.exist"],
     ],
 )
-@pytest.mark.parametrize("compile", [False, True])
+@pytest.mark.parametrize("compile", [False])
 def test_moe_mxfp8_training(target_fqns: list[str], compile: bool):
     block_size = 32

torchao/prototype/moe_training/kernels/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -7,3 +7,6 @@
 from torchao.prototype.moe_training.kernels.jagged_float8_scales import (
     triton_fp8_per_group_rowwise_scales as triton_fp8_per_group_rowwise_scales,
 )
+from torchao.prototype.moe_training.kernels.mxfp8 import (
+    fbgemm_mxfp8_grouped_mm_2d_3d as fbgemm_mxfp8_grouped_mm_2d_3d,
+)
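With this re-export in place, call sites such as scaled_grouped_mm.py below can pull the wrapper straight from the kernels package:

from torchao.prototype.moe_training.kernels import fbgemm_mxfp8_grouped_mm_2d_3d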

torchao/prototype/moe_training/kernels/mxfp8.py

Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
+import logging
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+import torch
+
+try:
+    import fbgemm_gpu.experimental.gen_ai  # noqa: F401
+except Exception as e:
+    logger.warning(
+        f"fbgemm_gpu_genai package is required for this feature but import failed with exception: {e}. "
+        "Please install nightly builds of pytorch and fbgemm_gpu_genai using this command and try again: "
+        "pip3 install --force-reinstall --pre torch fbgemm-gpu-genai --index-url https://download.pytorch.org/whl/nightly/cu129"
+        " If errors persist, please file a bug report."
+    )
+
+
+@torch.library.custom_op("torchao::fbgemm_mxfp8_grouped_mm_2d_3d", mutates_args={})
+def fbgemm_mxfp8_grouped_mm_2d_3d(
+    A_mx: torch.Tensor,
+    A_scale: torch.Tensor,
+    B_t_mx_dim1: torch.Tensor,
+    B_t_scales_dim1: torch.Tensor,
+    offs: torch.Tensor,
+    block_size: int = 32,
+    out_dtype: torch.dtype = torch.bfloat16,
+) -> torch.Tensor:
+    assert A_mx.ndim == 2, "A_mx tensor must be 2D"
+    assert B_t_mx_dim1.ndim == 3, "B_t_mx_dim1 tensor must be 3D"
+    assert block_size == 32, "Only block_size=32 is supported"
+    assert out_dtype == torch.bfloat16, "Only out_dtype=bfloat16 is supported"
+
+    # "A" and "offs" have already been padded so token group sizes along Mg are
+    # multiples of the scaling block size (32).
+    # e.g. offs = [32, 64, 128]
+    # From this, we compute `group_sizes` and `starting_row_after_padding`:
+    #   group_sizes = [32, 32, 64]
+    #   starting_row_after_padding = [0, 32, 64, 128]
+    zero = torch.tensor([0], dtype=offs.dtype, device=offs.device)
+    group_sizes = torch.diff(offs, prepend=zero).to(torch.int64)
+    starting_row_after_padding = torch.cat((zero, offs))
+    out = torch.ops.fbgemm.mx8mx8bf16_grouped_stacked(
+        A_mx,  # (Mg, K)
+        B_t_mx_dim1,  # (E, K, N)
+        A_scale,  # (Mg, K//block_size)
+        B_t_scales_dim1,  # (E, K//block_size, N)
+        group_sizes,  # size of each token group, computed from end idx of each group (`offs`)
+        starting_row_after_padding=starting_row_after_padding,
+    )
+    return out
+
+
+@fbgemm_mxfp8_grouped_mm_2d_3d.register_fake
+def _fbgemm_mxfp8_grouped_mm_2d_3d_fake(
+    A_mx: torch.Tensor,
+    A_scale: torch.Tensor,
+    B_t_mx_dim1: torch.Tensor,
+    B_t_scales_dim1: torch.Tensor,
+    offs: torch.Tensor,
+    block_size: int = 32,
+    out_dtype: torch.dtype = torch.bfloat16,
+) -> torch.Tensor:
+    assert A_mx.ndim == 2, "A_mx tensor must be 2D"
+    assert B_t_mx_dim1.ndim == 3, "B_t_mx_dim1 tensor must be 3D"
+    mg, k = A_mx.shape
+    e, k, n = B_t_mx_dim1.shape
+    n_groups = offs.numel()
+    assert n_groups == e, (
+        "Size of `offs` (number of groups) must match first dim of `B_t_mx_dim1`"
+    )
+    output = torch.empty((mg, n), dtype=torch.bfloat16, device=A_mx.device)
+    return output
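The group-size bookkeeping above is easiest to see with the concrete `offs` from the code comment; a small standalone check (illustration only, runs on CPU):

import torch

offs = torch.tensor([32, 64, 128], dtype=torch.int32)   # end row of each token group
zero = torch.tensor([0], dtype=offs.dtype)
group_sizes = torch.diff(offs, prepend=zero).to(torch.int64)  # tensor([32, 32, 64])
starting_row_after_padding = torch.cat((zero, offs))          # tensor([0, 32, 64, 128])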

torchao/prototype/moe_training/scaled_grouped_mm.py

Lines changed: 15 additions & 4 deletions
@@ -13,6 +13,7 @@
 from torchao.float8.float8_utils import tensor_to_scale, to_fp8_saturated
 from torchao.prototype.moe_training.conversion_utils import MoEScalingType
 from torchao.prototype.moe_training.kernels import (
+    fbgemm_mxfp8_grouped_mm_2d_3d,
     triton_fp8_per_group_colwise_scales,
     triton_fp8_per_group_rowwise_scales,
     triton_fp8_rowwise_3d_transpose_rhs,

@@ -283,7 +284,6 @@ def forward(
         assert A.ndim == 2, "A must be 2D"
         assert B_t.ndim == 3, "B must be 3D"
         assert block_size == 32, "Only block_size=32 is supported"
-        assert emulated, "Only emulated mxfp8 grouped gemm is supported"

         # Cast to mxpf8 across dim -1.
         # A_mx shape: (M, K)

@@ -314,11 +314,17 @@ def forward(
         ctx.save_for_backward(A, B_t, offs)
         ctx.block_size = block_size
         ctx.out_dtype = out_dtype
+        ctx.emulated = emulated

         # Perform scaled grouped GEMM and return result.
         # output = input @ weight.T
         # output shape: (M, N)
-        out = _emulated_mxfp8_scaled_grouped_mm_2d_3d(
+        mxfp8_2d_3d_grouped_mm = (
+            _emulated_mxfp8_scaled_grouped_mm_2d_3d
+            if emulated
+            else fbgemm_mxfp8_grouped_mm_2d_3d
+        )
+        out = mxfp8_2d_3d_grouped_mm(
             A_mx,
             A_scale,
             B_t_mx_dim1,

@@ -334,6 +340,7 @@ def backward(ctx, grad_out: torch.Tensor):
         A, B_t, offs = ctx.saved_tensors
         block_size = ctx.block_size
         out_dtype = ctx.out_dtype
+        emulated = ctx.emulated

         # grad_out_mx shape: (M, N)
         # grad_out_scale shape: (M, N//block_size)

@@ -355,7 +362,12 @@ def backward(ctx, grad_out: torch.Tensor):

         # Compute grad_A.
         # grad_A = scaled grouped mm of (M,N) @ (B,N,K) = (M,K)
-        grad_A = _emulated_mxfp8_scaled_grouped_mm_2d_3d(
+        mxfp8_2d_3d_grouped_mm = (
+            _emulated_mxfp8_scaled_grouped_mm_2d_3d
+            if emulated
+            else fbgemm_mxfp8_grouped_mm_2d_3d
+        )
+        grad_A = mxfp8_2d_3d_grouped_mm(
             grad_out_mx,
             grad_out_scale,
             B_mx_dim1,

@@ -385,7 +397,6 @@ def backward(ctx, grad_out: torch.Tensor):
         # Compute grad_B = grad_output_t @ A
         # grad_B_t = scaled grouped mm of (N,M) @ (M,K) = (E,N,K)
         # grad_B = grad_B_t.transpose(-2, -1) = (E,K,N)
-
         grad_B = _emulated_mxfp8_scaled_grouped_mm_2d_2d(
             grad_out_t_mx,
             grad_out_t_scales,
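The shape comments in the hunks above (forward: (Mg, K) @ (E, K, N) -> (Mg, N); grad_A: (Mg, N) @ (E, N, K) -> (Mg, K)) describe the 2d-3d grouped contraction over stacked token groups. A toy dense reference of that contraction, illustration only and not the kernel (names and sizes are made up; runs on CPU):

import torch

E, K, N = 2, 16, 8
offs = torch.tensor([32, 96])                 # end row of each expert's token group along Mg
A = torch.randn(int(offs[-1]), K)             # (Mg, K): tokens for all experts, stacked
B_t = torch.randn(E, K, N)                    # (E, K, N): per-expert weights, pre-transposed

out = torch.empty(A.shape[0], N)
start = 0
for e, end in enumerate(offs.tolist()):
    out[start:end] = A[start:end] @ B_t[e]    # (M_e, K) @ (K, N) -> (M_e, N)
    start = end
print(out.shape)                              # torch.Size([96, 8])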
