
Commit 1064e83

Update

[ghstack-poisoned]

2 parents 7c1166e + d3306b2 · commit 1064e83

File tree

20 files changed (+795 lines, -100 lines)

examples/sam2_amg_server/generate_data.py

Lines changed: 3 additions & 0 deletions

@@ -60,6 +60,8 @@ def latencies_statistics(data):
     mean = np.mean(data_array)
     # Calculate the median
     median = np.median(data_array)
+    # Calculate the 90th percentile
+    p90 = np.percentile(data_array, 90)
     # Calculate the 95th percentile
     p95 = np.percentile(data_array, 95)
     # Calculate the 99th percentile
@@ -74,6 +76,7 @@ def latencies_statistics(data):
     {
         "mean": mean,
         "median": median,
+        "p90": p90,
         "p95": p95,
         "p99": p99,
         "p999": p999,

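Note: a minimal, self-contained sketch of what the new p90 statistic adds to the latency summary, using hypothetical sample data (not data from this repo):

    import numpy as np

    # Hypothetical latency samples in milliseconds.
    latencies_ms = [12.1, 15.3, 11.8, 90.4, 13.0, 14.2, 250.7, 12.9, 13.5, 16.0]
    data_array = np.array(latencies_ms)

    # np.percentile(a, q) returns the value below which q percent of the
    # samples fall, so p90 slots in between the median and the existing
    # p95/p99 tail statistics reported by latencies_statistics.
    stats = {
        "mean": np.mean(data_array),
        "median": np.median(data_array),
        "p90": np.percentile(data_array, 90),
        "p95": np.percentile(data_array, 95),
        "p99": np.percentile(data_array, 99),
    }
    print(stats)
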
examples/sam2_amg_server/result.csv

Lines changed: 70 additions & 70 deletions
Large diffs are not rendered by default.

ruff.toml

Lines changed: 6 additions & 0 deletions

@@ -2,3 +2,9 @@
 # Add linting rules here
 lint.select = ["F", "I"]
 lint.ignore = ["E731"]
+
+
+# Exclude third-party modules
+exclude = [
+    "third_party/*",
+]

setup.py

Lines changed: 5 additions & 4 deletions

@@ -215,10 +215,7 @@ def get_extensions():
     extra_link_args = []
     extra_compile_args = {
         "cxx": [f"-DPy_LIMITED_API={PY3_9_HEXCODE}"],
-        "nvcc": [
-            "-O3" if not debug_mode else "-O0",
-            "-t=0",
-        ],
+        "nvcc": ["-O3" if not debug_mode else "-O0", "-t=0", "-std=c++17"],
     }

     if not IS_WINDOWS:
@@ -257,12 +254,16 @@ def get_extensions():
         use_cutlass = True
         cutlass_dir = os.path.join(third_party_path, "cutlass")
         cutlass_include_dir = os.path.join(cutlass_dir, "include")
+        cutlass_tools_include_dir = os.path.join(
+            cutlass_dir, "tools", "util", "include"
+        )
         cutlass_extensions_include_dir = os.path.join(cwd, extensions_cuda_dir)
     if use_cutlass:
         extra_compile_args["nvcc"].extend(
             [
                 "-DTORCHAO_USE_CUTLASS",
                 "-I" + cutlass_include_dir,
+                "-I" + cutlass_tools_include_dir,
                 "-I" + cutlass_extensions_include_dir,
             ]
         )
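For context, a sketch of where an extra_compile_args dict like the one above ends up, assuming a CUDA toolchain is available; the extension name and source path below are hypothetical, not from this repo:

    from torch.utils.cpp_extension import CUDAExtension

    debug_mode = False
    extra_compile_args = {
        "cxx": ["-O3" if not debug_mode else "-O0"],
        # nvcc gets its own flag list: -std=c++17 matches the flag added in
        # this commit, and -I entries point nvcc at the CUTLASS headers.
        "nvcc": ["-O3" if not debug_mode else "-O0", "-t=0", "-std=c++17"],
    }

    ext = CUDAExtension(
        name="my_cuda_ops",             # hypothetical
        sources=["csrc/my_kernel.cu"],  # hypothetical
        extra_compile_args=extra_compile_args,
    )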

test/float8/test_base.py

Lines changed: 5 additions & 1 deletion

@@ -164,7 +164,10 @@ def test_transpose(self):

     @pytest.mark.parametrize("shape", [(8, 16), (4, 8, 16), (2, 4, 8, 16)])
     @pytest.mark.parametrize("axiswise_dim", [0, -1])
-    def test_axiswise_dynamic_cast(self, shape, axiswise_dim):
+    @pytest.mark.parametrize("round_scales_to_power_of_2", [True, False])
+    def test_axiswise_dynamic_cast(
+        self, shape, axiswise_dim, round_scales_to_power_of_2
+    ):
         a = torch.randn(*shape, dtype=torch.bfloat16)
         linear_mm_config = LinearMMConfig()
         a_fp8 = hp_tensor_to_float8_dynamic(
@@ -173,6 +176,7 @@ def test_axiswise_dynamic_cast(self, shape, axiswise_dim):
             linear_mm_config,
             scaling_granularity=ScalingGranularity.AXISWISE,
             axiswise_dim=axiswise_dim,
+            round_scales_to_power_of_2=round_scales_to_power_of_2,
         )
         a_dq = a_fp8.to_original_precision()
         sqnr = compute_error(a, a_dq)
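A quick aside on why one might round scales down to powers of two (a general floating-point fact, not something this diff states): scaling by a power of two only shifts the exponent, so the quantize/dequantize round trip through the scale is exact.

    import torch

    x = torch.tensor([1.2345678], dtype=torch.float32)
    pow2_scale = 0.5
    odd_scale = 0.3

    # Dividing and re-multiplying by a power of two touches only the
    # exponent bits, so the round trip is exact.
    print(torch.equal((x / pow2_scale) * pow2_scale, x))  # True

    # With an arbitrary scale, both operations round the mantissa, and
    # the round trip is typically (not always) inexact.
    print(torch.equal((x / odd_scale) * odd_scale, x))    # typically False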

test/float8/test_compile.py

Lines changed: 14 additions & 6 deletions

@@ -45,11 +45,7 @@
     hp_tensor_to_float8_delayed,
     hp_tensor_to_float8_dynamic,
 )
-from torchao.float8.float8_tensor import (
-    GemmInputRole,
-    LinearMMConfig,
-    ScaledMMConfig,
-)
+from torchao.float8.float8_tensor import GemmInputRole, LinearMMConfig, ScaledMMConfig
 from torchao.float8.float8_utils import config_has_stateful_scaling
 from torchao.float8.stateful_float8_linear import StatefulFloat8Linear
 from torchao.testing.float8.test_utils import get_test_float8_linear_config
@@ -420,13 +416,23 @@ def test_sync_amax_func_cuda_graph_success():
         torch.float16,
     ],
 )
-def test_dynamic_scale_numeric_parity(dtype: torch.dtype):
+@pytest.mark.parametrize(
+    "round_scales_to_power_of_2",
+    [
+        True,
+        False,
+    ],
+)
+def test_dynamic_scale_numeric_parity(
+    dtype: torch.dtype, round_scales_to_power_of_2: bool
+):
     scaling_type_weight = ScalingType.DYNAMIC
     torch.manual_seed(42)
     hp_tensor1 = torch.randn(16, 16, device="cuda", dtype=dtype)
     hp_tensor2 = hp_tensor1.detach().clone()
     float8_config = Float8LinearConfig(
         cast_config_weight=CastConfig(scaling_type=scaling_type_weight),
+        round_scales_to_power_of_2=round_scales_to_power_of_2,
     )
     linear_mm_config = LinearMMConfig(
         # output
@@ -456,13 +462,15 @@ def test_dynamic_scale_numeric_parity(dtype: torch.dtype):
         e4m3_dtype,
         linear_mm_config,
         gemm_input_role=GemmInputRole.WEIGHT,
+        round_scales_to_power_of_2=float8_config.round_scales_to_power_of_2,
     )
     torch._dynamo.reset()
     float8_compile = torch.compile(hp_tensor_to_float8_dynamic)(
         hp_tensor2,
         e4m3_dtype,
         linear_mm_config,
         gemm_input_role=GemmInputRole.WEIGHT,
+        round_scales_to_power_of_2=float8_config.round_scales_to_power_of_2,
     )
     assert torch.equal(float8_eager._scale, float8_compile._scale)
     assert torch.equal(float8_eager._data, float8_compile._data)
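The test above follows the usual eager-vs-compiled parity pattern. A minimal, self-contained sketch of that pattern, with a hypothetical stand-in for hp_tensor_to_float8_dynamic:

    import torch

    # Hypothetical stand-in: scale by the max-abs value and cast down.
    def scale_and_cast(x: torch.Tensor) -> torch.Tensor:
        scale = x.abs().amax().clamp(min=1e-12)
        return (x / scale).to(torch.float16)

    x = torch.randn(16, 16)
    eager_out = scale_and_cast(x)

    torch._dynamo.reset()  # clear cached compiled graphs, as the test does
    compiled_out = torch.compile(scale_and_cast)(x.detach().clone())

    # Bitwise equality, mirroring the torch.equal assertions above.
    assert torch.equal(eager_out, compiled_out)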

test/float8/test_float8_utils.py

Lines changed: 65 additions & 0 deletions

@@ -0,0 +1,65 @@
+import unittest
+
+import pytest
+import torch
+
+from torchao.float8.float8_utils import _round_scale_down_to_power_of_2
+from torchao.utils import TORCH_VERSION_AT_LEAST_2_5
+
+if not TORCH_VERSION_AT_LEAST_2_5:
+    pytest.skip("Unsupported PyTorch version", allow_module_level=True)
+
+
+# source for notable single-precision cases:
+# https://en.wikipedia.org/wiki/Single-precision_floating-point_format
+@unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        # ("test_case_name", input, expected result)
+        ("one", 1.0, 1.0),
+        ("inf", float("inf"), float("inf")),
+        ("nan", float("nan"), float("nan")),
+        ("smallest positive subnormal number", 2**-126 * 2**-23, 2**-126 * 2**-23),
+        ("largest normal number", 2**127 * (2 - 2**-23), float("inf")),
+        ("smallest positive normal number", 2**-126, 2**-126),
+        ("largest number less than one", 1.0 - 2**-24, 0.5),
+        ("smallest number larger than one", 1.0 + 2**-23, 1.0),
+        # TODO(danielvegamyhre): debug why creating a tensor with largest
+        # subnormal value in CI env for pytorch 2.5.1 truncates the value to 0.
+        # ("largest subnormal number", [2**-126 * (1 - 2**-23), 1.1754943508222875e-38]),
+    ],
+)
+def test_round_scale_down_to_power_of_2_valid_inputs(
+    test_case: dict,
+):
+    test_case_name, input, expected_result = test_case
+    input_tensor, expected_tensor = (
+        torch.tensor(input, dtype=torch.float32).cuda(),
+        torch.tensor(expected_result, dtype=torch.float32).cuda(),
+    )
+    result = _round_scale_down_to_power_of_2(input_tensor)
+
+    assert (
+        torch.equal(result, expected_tensor)
+        or (result.isnan() and expected_tensor.isnan())
+    ), f"test: {test_case_name}, input: {input_tensor}, expected {expected_tensor}, but got {result}"
+
+
+@pytest.mark.parametrize(
+    "invalid_dtype",
+    [
+        torch.bfloat16,
+        torch.float16,
+        torch.float64,
+        torch.int8,
+        torch.uint8,
+        torch.int32,
+        torch.uint32,
+        torch.int64,
+    ],
+)
+def test_non_float32_input(invalid_dtype: torch.dtype):
+    non_float32_tensor = torch.tensor([3.0], dtype=invalid_dtype)
+    with pytest.raises(AssertionError, match="scale must be float32 tensor"):
+        _round_scale_down_to_power_of_2(non_float32_tensor)
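Two expected values above are worth a note. If the rounding is implemented along the lines of exp2(floor(log2(x))) in float32 (an assumption; this diff does not show the implementation of _round_scale_down_to_power_of_2), the edge cases fall out of float32 rounding itself:

    import torch

    # Assumed reference behavior, not torchao's code.
    # Largest normal float32: log2(x) ~= 127.99999993, which is not
    # representable in float32 and rounds to exactly 128.0, so exp2
    # overflows to inf -- matching the expected value in the test.
    x = torch.tensor(2**127 * (2 - 2**-23), dtype=torch.float32)
    print(torch.exp2(torch.floor(torch.log2(x))))  # tensor(inf)

    # Largest float32 below one: log2(y) is a tiny negative number, so
    # floor gives -1.0 and the result is 0.5 -- again matching the test.
    y = torch.tensor(1.0 - 2**-24, dtype=torch.float32)
    print(torch.exp2(torch.floor(torch.log2(y))))  # tensor(0.5000)
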
Lines changed: 74 additions & 0 deletions

@@ -0,0 +1,74 @@
+import pytest
+import torch
+
+from torchao.float8.float8_utils import compute_error
+from torchao.ops import mx_fp4_bf16, mx_fp8_bf16
+from torchao.prototype.mx_formats.mx_tensor import DTYPE_FP4, MXTensor
+from torchao.prototype.mx_formats.utils import to_blocked
+from torchao.utils import TORCH_VERSION_AT_LEAST_2_4, is_sm_at_least_100
+
+if not TORCH_VERSION_AT_LEAST_2_4:
+    pytest.skip("Unsupported PyTorch version", allow_module_level=True)
+
+
+def run_matrix_test(M: int, K: int, N: int, format) -> float:
+    dtype = torch.bfloat16
+    device = torch.device("cuda")
+
+    a = torch.rand((M, K), dtype=dtype, device=device)
+    b = torch.rand((N, K), dtype=dtype, device=device)
+
+    fmt = torch.float8_e4m3fn if format == "fp8" else DTYPE_FP4
+    mx_func = mx_fp8_bf16 if format == "fp8" else mx_fp4_bf16
+
+    a_mx = MXTensor.to_mx(a, fmt, 32)
+    b_mx = MXTensor.to_mx(b, fmt, 32)
+
+    a_data = a_mx._data
+    b_data = b_mx._data
+    assert b_data.is_contiguous()
+    b_data = b_data.transpose(-1, -2)
+
+    a_scale = a_mx._scale_e8m0.view(M, K // 32)
+    b_scale = b_mx._scale_e8m0.view(N, K // 32)
+
+    a_scale_block = to_blocked(a_scale)
+    b_scale_block = to_blocked(b_scale)
+
+    out_hp = a_mx.to_dtype(torch.bfloat16) @ b_mx.to_dtype(torch.bfloat16).transpose(
+        -1, -2
+    )
+    out = mx_func(a_data, b_data, a_scale_block, b_scale_block)
+
+    return compute_error(out_hp, out).item()
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(
+    not is_sm_at_least_100(), reason="CUDA capability >= 10.0 required for mxfloat8"
+)
+@pytest.mark.parametrize(
+    "size",
+    [
+        (128, 128, 128),
+        (256, 256, 256),
+        (384, 384, 384),  # Small
+        (512, 512, 512),
+        (768, 768, 768),  # Medium
+        (1024, 1024, 1024),
+        (8192, 8192, 8192),  # Large
+        (128, 256, 384),
+        (256, 384, 512),  # Non-square
+        (129, 256, 384),
+        (133, 512, 528),  # Non-aligned
+    ],
+    ids=lambda x: f"{x[0]}x{x[1]}x{x[2]}",
+)
+@pytest.mark.parametrize("format", ["fp8", "fp4"])
+def test_matrix_multiplication(size, format):
+    M, K, N = size
+    sqnr = run_matrix_test(M, K, N, format)
+    threshold = 80.0
+    assert (
+        sqnr >= threshold
+    ), f"{format} SQNR {sqnr} below threshold for dims {M}x{K}x{N}"

third_party/cutlass

Submodule cutlass updated 361 files
