import functools

import pytest
import torch

from flashinfer import fp4_quantize
from flashinfer.utils import is_sm100a_supported

DTYPES = [torch.float16, torch.bfloat16]
# The batch dimension doesn't need to be a multiple of 128
SHAPES = [(128, 64), (256, 128), (120, 64), (200, 256)]
SEEDS = [42]
CUDA_DEVICES = ["cuda:0"]

# ... unchanged lines omitted ...

BLOCK_SIZE = 16


def swizzle_sf(
    unswizzled_sf: torch.Tensor,
    original_row: int,
    original_col: int,
    scaling_vector_size: int = 16,
) -> torch.Tensor:
54+ """
55+ Inverse of `unswizzle_sf`. Converts an unswizzled tensor back to swizzled form.
56+
57+ Args:
58+ unswizzled_sf: Tensor of shape [row, col // scaling_vector_size].
59+ original_row: Original row dimension (e.g., 120).
60+ original_col: Original column dimension (e.g., 64).
61+ scaling_vector_size: Scaling factor (default 16).
62+
63+ Returns:
64+ Swizzled tensor of shape [padded_row, padded_col // scaling_vector_size].
65+ """
    unswizzled_sf = unswizzled_sf.contiguous()
    factor = scaling_vector_size * 4
    padded_row = ((original_row + 128 - 1) // 128) * 128  # Next multiple of 128
    padded_col = ((original_col + factor - 1) // factor) * factor  # Next multiple of factor (64 by default)

    # Pad the input tensor to [padded_row, padded_col // scaling_vector_size]
    pad_rows = padded_row - original_row
    pad_cols = (padded_col - original_col) // scaling_vector_size
    padded_sf = torch.nn.functional.pad(
        unswizzled_sf,
        (0, pad_cols, 0, pad_rows),
        mode="constant",
        value=0,
    ).contiguous()

    # Reshape and transpose to reverse unswizzle_sf
    num_m_tiles = padded_row // 128
    num_k_tiles = padded_col // factor
    sf_reshaped = padded_sf.view(num_m_tiles, 4, 32, num_k_tiles, 4)  # Reverse the unswizzle reshape
    sf_swizzled = sf_reshaped.transpose(
        1, 3
    )  # Reverse the transpose: [num_m_tiles, num_k_tiles, 32, 4, 4]
    sf_swizzled = sf_swizzled.reshape(
        padded_row, padded_col // scaling_vector_size
    )  # Flatten to [padded_row, padded_col // scaling_vector_size]

    return sf_swizzled.contiguous()


def unswizzle_sf(
    sf: torch.Tensor, row: int, col: int, scaling_vector_size: int = 16
) -> torch.Tensor:
    """Convert a swizzled scale tensor back to row-major [row, col // scaling_vector_size]."""
    factor = scaling_vector_size * 4
    num_m_tiles = (row + 128 - 1) // 128
    num_k_tiles = (col + factor - 1) // factor
    # SF layout: [num_m_tiles, num_k_tiles, 32 (m_tile, column major), 4 (m_tile, column major), 4 (k_tile)]
    sf_reshaped = sf.view(num_m_tiles, num_k_tiles, 32, 4, 4)
    sf_unswizzle = sf_reshaped.transpose(1, 3)
    sf_unswizzle = sf_unswizzle.reshape(num_m_tiles * 32 * 4, num_k_tiles * 4)
    sf_unswizzle_sliced = sf_unswizzle[:row, : (col // scaling_vector_size)]
    return sf_unswizzle_sliced.contiguous()
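
# Shape note (informal): for an (m, n) input with n a multiple of 16, the
# unswizzled scale tensor is (m, n // 16). swizzle_sf pads m up to the next
# multiple of 128 (and n // 16 up to the next multiple of 4) before reordering;
# unswizzle_sf reverses the reordering and slices the padding back off.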


def cast_from_fp4(x, m, n):
    # The fp4 values are packed in uint8 as [v_1st | v_2nd]
    v_2nd = x & 0xF
    # ... unchanged lines omitted: the rest of cast_from_fp4 and the
    # ref_nvfp4_quant / recover_swizzled_scales reference helpers used below ...


@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("shape", SHAPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_fp4_quantization(
    dtype: torch.dtype,
    shape: tuple[int, int],
    seed: int,
    device: str,
) -> None:
    if not is_sm100a_supported(torch.device(device)):
        pytest.skip("NVFP4 requires compute capability 10.0 or above")
    torch.set_default_device(device)
    torch.manual_seed(seed)
    m, n = shape
    x = torch.randn((m, n), dtype=dtype)
    tensor_amax = torch.abs(x).max().to(torch.float32)
    # Global scale maps the tensor's amax into the FP8 (E4M3) scale x FP4 (E2M1) value range
    global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / tensor_amax
    out_ref, scale_ref = ref_nvfp4_quant(x, global_scale)

    out, out_scale = fp4_quantize(x, global_scale, BLOCK_SIZE, False)
    assert n % BLOCK_SIZE == 0, f"n must be divisible by {BLOCK_SIZE}"
    scale_ans = recover_swizzled_scales(
        out_scale.reshape(-1, n // BLOCK_SIZE).view(torch.float8_e4m3fn), m, n
    )
    # ... unchanged lines omitted ...
    torch.testing.assert_close(scale_ans, scale_ref, rtol=1e-1, atol=1e-1)


@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("shape", SHAPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_scale_swizzling(
    dtype: torch.dtype,
    shape: tuple[int, int],
    seed: int,
    device: str,
) -> None:
    if not is_sm100a_supported(torch.device(device)):
        pytest.skip("NVFP4 requires compute capability 10.0 or above")
    torch.set_default_device(device)
    torch.manual_seed(seed)
    m, n = shape
    x = torch.randn((m, n), dtype=dtype)
    tensor_amax = torch.abs(x).max().to(torch.float32)
    global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / tensor_amax

    _, unswizzled_scale = fp4_quantize(x, global_scale, BLOCK_SIZE, False, False)
    _, swizzled_scale = fp4_quantize(x, global_scale, BLOCK_SIZE, False, True)
    assert n % BLOCK_SIZE == 0, f"n must be divisible by {BLOCK_SIZE}"
    recovered_unswizzled_scale = unswizzle_sf(
        swizzle_sf(unswizzled_scale, m, n),
        m,
        n,
    )

    # Because of padding we do not expect
    #   swizzle_sf(unswizzled_scale, m, n) == swizzled_scale
    # element-wise, so compare in the unswizzled domain instead.
    ref_unswizzled_scale = unswizzle_sf(swizzled_scale, m, n)
    assert_equal = functools.partial(torch.testing.assert_close, rtol=0, atol=0)
    assert_equal(recovered_unswizzled_scale, unswizzled_scale)
    assert_equal(ref_unswizzled_scale, unswizzled_scale)
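

# A minimal, CPU-only round-trip sanity check of the two swizzling helpers
# above (an illustrative sketch; no quantization kernel or GPU required). The
# (120, 64) shape is an arbitrary case whose row count is not a multiple of 128.
def test_swizzle_roundtrip_cpu() -> None:
    row, col = 120, 64
    sf = torch.randint(0, 256, (row, col // BLOCK_SIZE), dtype=torch.uint8, device="cpu")
    recovered = unswizzle_sf(swizzle_sf(sf, row, col), row, col)
    assert torch.equal(recovered, sf)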


if __name__ == "__main__":
    pytest.main([__file__, "-v"])