[float8] improve eager numerics for dynamic scales and get on par with torch.compile #904

Merged Oct 1, 2024 (43 commits)

Changes from 2 commits

Commits (43):
6bf0f5c  [float8] improve eager numerics for dynamic scales (weifengpy, Sep 19, 2024)
553687f  leave torch.linalg.vector_norm for another PR (weifengpy, Sep 19, 2024)
19a592d  cuda (weifengpy, Sep 19, 2024)
218290e  remove _data and investigate (weifengpy, Sep 19, 2024)
24ec914  remove _data comment (weifengpy, Sep 19, 2024)
c099486  upcast to float32 is enough (weifengpy, Sep 21, 2024)
b93ffc8  explain why float32 (weifengpy, Sep 21, 2024)
ebff416  _data parity (weifengpy, Sep 21, 2024)
8978ab2  handle sm8.9 (weifengpy, Sep 21, 2024)
f17dc12  fix transformer unit test (weifengpy, Sep 22, 2024)
511c751  print if error (weifengpy, Sep 26, 2024)
9becda1  Add tutorial for trainable tensor subclass (#908) (andrewor14, Sep 20, 2024)
e4fdca9  Introducing 1-bit quantization for Llama in torchchat (#910) (vaishnavi17, Sep 20, 2024)
0cd4d37  Rename Floating point to fp8 (#909) (jainapurva, Sep 20, 2024)
014558d  [float8] fix typo in bitwise_identical unit test (#918) (weifengpy, Sep 23, 2024)
3267402  Adding example for quantized tensor + tensor parallelism (#785) (jerryzh168, Sep 23, 2024)
1e07eff  rename cuda mode -> gpu mode (#925) (msaroufim, Sep 24, 2024)
ebdeed0  Add workaround to recover the perf for quantized vit in torch.compile… (jerryzh168, Sep 24, 2024)
09ffa22  clean up device checks in float8 unit test files (#923) (vkuzo, Sep 24, 2024)
0b8dd85  [low-bit optim] Change 8-bit and FP8 optim block size from 2048 to 25… (gau-nernst, Sep 24, 2024)
87faf04  Float8 autoquant weight only (#866) (jainapurva, Sep 24, 2024)
3a9fdb0  Fix failing FP6 benchmark (#931) (tobiasvanderwerff, Sep 25, 2024)
fc6c393  Remove two if statements in fp8 padding (#935) (y-sq, Sep 25, 2024)
0043ace  [Distributed] Improve sharding example (#937) (kwen2501, Sep 25, 2024)
ab3435c  Add composable QAT quantizer (#938) (andrewor14, Sep 25, 2024)
a05a40f  resolve conflict with latest main (weifengpy, Sep 26, 2024)
334891b  Add torchchat quantizer (metascroy, Sep 25, 2024)
c706139  Add compile tests to test suite (#906) (jerryzh168, Sep 26, 2024)
93554c0  Fix up CMakeLists and reorganize some code locations (metascroy, Sep 26, 2024)
efd9bb9  [float8] all-reduce amax on dp mesh instead of global pg (#933) (weifengpy, Sep 26, 2024)
85126cc  int8 dynamic quant + bsr support (#821) (jcaip, Sep 26, 2024)
a5a426e  fixing some issues with our support for 70/405B models (#941) (HDCharles, Sep 26, 2024)
e7270f1  Update INT8 mixed-precision training test to be less flaky (#950) (gau-nernst, Sep 26, 2024)
352685c  Add executorch parallel (metascroy, Sep 26, 2024)
168cfe9  Merge branch 'weifengpy-dynamic_scale_numerics' into dynamic_scale_nu… (weifengpy, Sep 26, 2024)
5900c3e  Merge branch 'main' into dynamic_scale_numerics (weifengpy, Sep 26, 2024)
37e1479  test CI (weifengpy, Sep 26, 2024)
2efde49  better comment on why upcasting (weifengpy, Sep 26, 2024)
8c04f4f  control seed (weifengpy, Sep 26, 2024)
04b229b  move unit test to test_compile (weifengpy, Sep 26, 2024)
8b7c2ef  fix typo (weifengpy, Sep 26, 2024)
9346afd  float64 upcasting after allreduce (weifengpy, Sep 27, 2024)
3d0da20  use LinearMMConfig (weifengpy, Sep 30, 2024)
34 changes: 33 additions & 1 deletion test/float8/test_base.py
@@ -15,6 +15,9 @@

import torch
import torch.nn as nn
from torchao.float8.float8_scaling_utils import (
    hp_tensor_to_float8_dynamic,
)

from torchao.utils import TORCH_VERSION_AT_LEAST_2_5

@@ -53,7 +56,7 @@
is_cuda_8_9 = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 9)

def bitwise_identical(a: Float8Tensor, b: Float8Tensor) -> bool:
-    assert torch.all(a._data == b._data).item(), "scales are not identical"
+    assert torch.all(a._scale == b._scale).item(), "scales are not identical"
    assert torch.all(a._data == b._data).item(), "data is not identical"
    return True

@@ -604,6 +607,35 @@ def test_small_amax_float16(self, float8_dtype):
        x = torch.tensor([target_amax], dtype=torch.float16, device="cuda")
        scale = tensor_to_scale(x, float8_dtype)
        assert not torch.any(torch.isinf(scale))

    @pytest.mark.parametrize(
        "dtype",
        [
            torch.float32,
            torch.bfloat16,
            torch.float16,
        ],
    )
    def test_dynamic_scale_parity(self, dtype: torch.dtype):
Contributor: nit: move to test_compile.py since this is testing compile vs eager?

        scaling_type_weight = ScalingType.DYNAMIC
        torch.manual_seed(42)
        hp_tensor = torch.randn(768, 32, device="cuda", dtype=dtype)
        float8_config = Float8LinearConfig(
            cast_config_weight=CastConfig(scaling_type=scaling_type_weight),
        )
        float8_eager = hp_tensor_to_float8_dynamic(
            hp_tensor,
            torch.float8_e4m3fn,
            float8_config,
            gemm_input_role=GemmInputRole.WEIGHT,
        )
        float8_compile = torch.compile(hp_tensor_to_float8_dynamic)(
            hp_tensor,
            torch.float8_e4m3fn,
            float8_config,
            gemm_input_role=GemmInputRole.WEIGHT,
        )
        assert bitwise_identical(float8_eager, float8_compile)


class TestFloat8LinearUtils(unittest.TestCase):
5 changes: 4 additions & 1 deletion torchao/float8/float8_tensor.py
@@ -163,7 +163,10 @@ def forward(

        DTensor Invariant: DTensor must always be the outer most tensor subclass
        """
-        tensor_scaled = tensor * scale
+        # Required by scaled_mm, scale is always float32.
+        # Cast tensor to float32 to improve numerics and
+        # get on-par with torch.compile.
+        tensor_scaled = tensor.to(torch.float32) * scale
Contributor Author (weifengpy): without upcasting, the eager numerics look like -157.00000000000000000000 while compile gives -157.06507873535156250000.
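A minimal sketch (my own, not code from the PR) of how that gap arises, assuming the per-tensor scale is a 0-dim float32 tensor as produced by dynamic scaling: PyTorch type promotion lets a 0-dim tensor keep the product in bfloat16, so only an explicit upcast matches what inductor emits. Shapes and the scale value are illustrative.

import torch

hp = torch.randn(768, 32, dtype=torch.bfloat16, device="cuda")
scale = torch.tensor(123.0, dtype=torch.float32, device="cuda")  # 0-dim per-tensor scale

no_upcast = hp * scale                      # 0-dim scale does not promote: product stays in bfloat16
with_upcast = hp.to(torch.float32) * scale  # what inductor effectively generates

# the intermediate products differ at bfloat16 precision; after the fp8 cast the
# difference survives only when a value sits on a rounding boundary, which is
# what the bitwise parity test occasionally catches
print((no_upcast.float() - with_upcast).abs().max())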

Contributor Author (weifengpy): torch.compile upcasts the tensor ahead of time; see tmp0 = tl.load(in_ptr0 + (x0), None).to(tl.float32) in the following output code:

@triton.jit
def triton_(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 24576
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = tl.full([XBLOCK], True, tl.int1)
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), None).to(tl.float32)
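(For reference, assuming a recent PyTorch 2.x, output like the above can be dumped by enabling inductor's output-code logging before compiling, e.g. via the TORCH_LOGS="output_code" environment variable or the call below; this is a general debugging aid, not something from this PR.)

import torch
torch._logging.set_logs(output_code=True)  # print the Triton/C++ that inductor generates under torch.compile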

        bits_fp8 = to_fp8_saturated(tensor_scaled, float8_dtype)

        if isinstance(bits_fp8, DTensor):
3 changes: 3 additions & 0 deletions torchao/float8/float8_utils.py
@@ -42,6 +42,9 @@ def amax_to_scale(
        float8_dtype: The float8 dtype.
        orig_dtype: The original dtype of the tensor.
    """
    # Preserve precision in amax-to-scale conversion
    # and ensure on-par numerics with torch.compile
    amax = amax.to(torch.float64)
Contributor Author (weifengpy, Sep 19, 2024): upcast amax in amax_to_scale instead of tensor_to_amax, for two reasons (a toy sketch of the resulting ordering follows the list):

  • we can still do the bfloat16 all-reduce for amax
  • it is safer for delayed scaling, since it does not change the dtype of the amax_buffer
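A toy sketch (my own, not torchao's implementation) of the ordering those two points imply: amax stays in its original dtype for any all-reduce, and the upcast happens only inside the amax-to-scale step. The EPS value and the trailing float32 cast are illustrative assumptions.

import torch

EPS = 1e-12  # illustrative epsilon, not necessarily the library's value

def toy_amax_to_scale(amax: torch.Tensor, float8_dtype=torch.float8_e4m3fn) -> torch.Tensor:
    # upcast only here, after any all-reduce has already run on the low-precision amax
    amax = amax.to(torch.float64)
    res = torch.finfo(float8_dtype).max / torch.clamp(amax, min=EPS)
    return res.to(torch.float32)  # scaled_mm expects a float32 scale

x = torch.randn(1024, dtype=torch.bfloat16)
amax = x.abs().max()  # 0-dim bfloat16, still cheap to all-reduce
# torch.distributed.all_reduce(amax, op=torch.distributed.ReduceOp.MAX)  # would run here when distributed
scale = toy_amax_to_scale(amax)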

Contributor: could you share why the upcasting happens?

Contributor Author (weifengpy): I can look into inductor more to see how it achieves fp64.

Contributor Author (weifengpy): torch.compile actually upcasts to float32 with tl.load(in_ptr0 + (x0), None).to(tl.float32). Upcasting to float64 helps further because torch.compile and eager show different numerics for 1.0 / float32 (but the same numerics for float64).

The float32 numeric difference can be verified with

import torch

def upcast_reciprocal(inp: torch.Tensor):
    return inp.reciprocal()

# a scale-like value for which eager and inductor round 1/x differently in float32
inp = torch.full([], 0.00817871093750000000, device="cuda", dtype=torch.float32)
eager_scale = upcast_reciprocal(inp)
compile_scale = torch.compile(upcast_reciprocal)(inp)
fp64_ground_truth = inp.to(torch.float64).reciprocal()
# expected to fail in float32, demonstrating the eager vs compile difference
assert torch.equal(eager_scale, compile_scale), f"{eager_scale=} vs {compile_scale=}, {fp64_ground_truth=}"

    if float8_dtype in FP8_TYPES:
        res = torch.finfo(float8_dtype).max / torch.clamp(amax, min=EPS)
    else: