
Commit f002586

Add float8 FakeQuantizeConfig and FakeQuantizer
**Summary:** This commit adds a QAT path for float8, using the same primitives as `torchao.quantization.Float8Tensor`, targeting PTQ configs like:
- `Float8DynamicActivationFloat8WeightConfig`
- `Float8DynamicActivationInt4WeightConfig`
- `Float8WeightOnlyConfig`

Usage:
```
import torch

from torchao.quantization.granularity import PerRow
from torchao.quantization.qat import quantize_, QATConfig
from torchao.quantization.quant_api import Float8DynamicActivationFloat8WeightConfig

base_config = Float8DynamicActivationFloat8WeightConfig(
    torch.float8_e4m3fn,
    PerRow(),
)
quantize_(model, QATConfig(base_config, step="prepare"))
quantize_(model, QATConfig(base_config, step="convert"))
```

OR

```
import torch

from torchao.quantization.granularity import PerRow
from torchao.quantization.qat import (
    Float8FakeQuantizeConfig,
    QATConfig,
    quantize_,
)

dtype = torch.float8_e4m3fn
granularity = PerRow()
quantize_(model, QATConfig(
    activation_config=Float8FakeQuantizeConfig(dtype, granularity),
    weight_config=Float8FakeQuantizeConfig(dtype, granularity),
    step="prepare",
))
# convert (same as above, not shown)
```

**Test Plan:**
```
python test/quantization/test_qat.py -k test_float8_fake_quantize_config
python test/quantization/test_qat.py -k test_float8_fake_quantize
python test/quantization/test_qat.py -k test_quantize_api_fp8_fp8
python test/quantization/test_qat.py -k test_quantize_api_fp8_int4
```
1 parent f01c956 commit f002586
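
For context on what the new fake quantizer approximates: float8 fake quantization is a quantize-dequantize roundtrip performed in high precision, scaling each row (or the whole tensor) by its absolute max relative to the float8 dtype's largest representable value, casting to float8, and casting back. The sketch below is only an illustration of that roundtrip, not the actual `Float8FakeQuantizer` code; the helper name is hypothetical and the 448.0 max for `torch.float8_e4m3fn` is an assumption.

```
import torch

def float8_rowwise_fake_quantize(x: torch.Tensor, float8_dtype=torch.float8_e4m3fn):
    # Hypothetical sketch of a per-row fake-quantize roundtrip, not the torchao
    # implementation. Assumes 448.0 is the max representable value of
    # torch.float8_e4m3fn.
    max_val = 448.0
    # Per-row absolute max, clamped to avoid division by zero.
    amax = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-12)
    scale = amax / max_val
    # Quantize to float8, then dequantize back to the original dtype.
    x_f8 = (x / scale).clamp(-max_val, max_val).to(float8_dtype)
    return x_f8.to(x.dtype) * scale

# Example: the fake-quantized output keeps the input's shape and dtype.
out = float8_rowwise_fake_quantize(torch.randn(32, 64))
```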

File tree

14 files changed (+229, -109 lines)


docs/source/api_ref_qat.rst

Lines changed: 2 additions & 0 deletions
@@ -26,10 +26,12 @@ Custom QAT APIs
 
     FakeQuantizeConfigBase
     IntxFakeQuantizeConfig
+    Float8FakeQuantizeConfig
     FakeQuantizedLinear
     FakeQuantizedEmbedding
     FakeQuantizerBase
     IntxFakeQuantizer
+    Float8FakeQuantizer
     linear.enable_linear_fake_quant
     linear.disable_linear_fake_quant
 

test/quantization/pt2e/test_duplicate_dq.py

Lines changed: 0 additions & 1 deletion
@@ -11,7 +11,6 @@
 from typing import Any
 
 import torch
-from torch.export import export_for_training
 from torch.testing._internal.common_quantization import QuantizationTestCase
 from torch.testing._internal.common_utils import IS_WINDOWS, run_tests
 

test/quantization/pt2e/test_quantize_pt2e.py

Lines changed: 0 additions & 1 deletion
@@ -19,7 +19,6 @@
     per_channel_weight_observer_range_neg_127_to_127,
     weight_observer_range_neg_127_to_127,
 )
-from torch.export import export_for_training
 from torch.fx import Node
 from torch.testing._internal.common_quantization import (
     NodeSpec as ns,

test/quantization/pt2e/test_quantize_pt2e_qat.py

Lines changed: 0 additions & 1 deletion
@@ -18,7 +18,6 @@
     default_symmetric_qnnpack_qat_qconfig,
 )
 from torch.ao.quantization.quantize_fx import prepare_qat_fx
-from torch.export import export_for_training
 from torch.testing._internal.common_cuda import TEST_CUDA
 from torch.testing._internal.common_quantization import (
     NodeSpec as ns,

test/quantization/pt2e/test_representation.py

Lines changed: 0 additions & 1 deletion
@@ -11,7 +11,6 @@
 
 import torch
 from torch._higher_order_ops.out_dtype import out_dtype  # noqa: F401
-from torch.export import export_for_training
 from torch.testing._internal.common_quantization import (
     NodeSpec as ns,
 )

test/quantization/pt2e/test_x86inductor_quantizer.py

Lines changed: 0 additions & 1 deletion
@@ -12,7 +12,6 @@
 
 import torch
 import torch.nn as nn
-from torch.export import export_for_training
 from torch.testing._internal.common_quantization import (
     NodeSpec as ns,
 )

test/quantization/test_qat.py

Lines changed: 99 additions & 33 deletions
@@ -14,17 +14,21 @@
 
 import torch
 import torch.nn.functional as F
-from parameterized import parameterized
 from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib  # noqa: F401
+from torch.testing._internal.common_utils import (
+    TestCase,
+    instantiate_parametrized_tests,
+    parametrize,
+)
 
 from torchao import quantize_
-from torchao.float8.config import ScalingGranularity
-from torchao.float8.float8_scaling_utils import hp_tensor_to_float8_dynamic
-from torchao.float8.float8_training_tensor import LinearMMConfig
+from torchao.quantization import Float8Tensor
 from torchao.quantization.granularity import (
+    Granularity,
     PerAxis,
     PerGroup,
     PerRow,
+    PerTensor,
     PerToken,
 )
 from torchao.quantization.linear_quant_modules import (
@@ -43,11 +47,12 @@
     FakeQuantizedEmbedding,
 )
 from torchao.quantization.qat.fake_quantize_config import (
+    Float8FakeQuantizeConfig,
     IntxFakeQuantizeConfig,
 )
 from torchao.quantization.qat.fake_quantizer import (
+    Float8FakeQuantizer,
     IntxFakeQuantizer,
-    _Float8RowwiseActivationFakeQuantizer,
 )
 from torchao.quantization.qat.linear import (
     FakeQuantizedLinear,
@@ -58,10 +63,11 @@
 from torchao.quantization.qat.utils import (
     _fake_quantize_per_channel_group,
     _fake_quantize_per_token,
-    _Float8RowwiseFakeQuantize,
     _get_qmin_qmax,
 )
 from torchao.quantization.quant_api import (
+    Float8DynamicActivationFloat8WeightConfig,
+    Float8DynamicActivationInt4WeightConfig,
     Int8DynamicActivationInt4WeightConfig,
 )
 from torchao.quantization.quant_primitives import (
@@ -83,6 +89,10 @@
     get_groupwise_affine_qparams,
     groupwise_affine_quantize_tensor,
 )
+from torchao.utils import (
+    _is_fbgemm_genai_gpu_available,
+    is_sm_at_least_89,
+)
 
 # TODO: put this in a common test utils file
 _CUDA_IS_AVAILABLE = torch.cuda.is_available()
@@ -193,7 +203,7 @@ def forward(self, x):
         return x
 
 
-class TestQAT(unittest.TestCase):
+class TestQAT(TestCase):
     SEED = 123
 
     def test_fake_quantize_per_channel_group(self):
@@ -1420,7 +1430,7 @@ def test_qat_linear_bias(self):
         example_inputs = m.example_inputs()
         m(*example_inputs)
 
-    @parameterized.expand([(torch.float32,), (torch.bfloat16,), (torch.float16,)])
+    @parametrize("dtype", [torch.float32, torch.bfloat16, torch.float16])
     def test_fake_quantize_per_token_vs_convert(self, dtype: torch.dtype):
         """
         Test that the following produce the exact same numerics:
@@ -1437,7 +1447,7 @@ def test_fake_quantize_per_token_vs_convert(self, dtype: torch.dtype):
         baseline_out = per_token_dynamic_quant(x)
         torch.testing.assert_close(fake_quantizer_out, baseline_out, atol=0, rtol=0)
 
-    @parameterized.expand([(torch.float32,), (torch.bfloat16,), (torch.float16,)])
+    @parametrize("dtype", [torch.float32, torch.bfloat16, torch.float16])
     def test_qat_8da4w_prepare_vs_convert(self, dtype: torch.dtype):
         """
         Test that the prepare and convert steps of Int8DynActInt4QATQuantizer produces
@@ -1548,7 +1558,7 @@ def test_qat_8da4w_eps(self):
         actual_out = converted_model.linear1(x)
         torch.testing.assert_close(expected_out, actual_out, atol=0, rtol=0)
 
-    @parameterized.expand([(True,), (False,)])
+    @parametrize("is_symmetric", [True, False])
     def test_fake_quantizer_range_learning(self, is_symmetric):
         """
         Test that range learning requires `IntxFakeQuantizer`s to be initialized correctly.
@@ -1589,7 +1599,7 @@ def test_fake_quantizer_range_learning(self, is_symmetric):
         self.assertTrue(fake_quantizer.zero_point.requires_grad)
         fake_quantizer(*example_inputs)
 
-    @parameterized.expand([(True,), (False,)])
+    @parametrize("is_symmetric", [True, False])
     def test_qat_range_learning(self, is_symmetric):
         """
         Test end-to-end QAT flow with range learning.
@@ -1664,24 +1674,6 @@ def test_qat_range_learning(self, is_symmetric):
         self.assertNotEqual(torch.count_nonzero(new_weight.grad), 0)
         self.assertFalse(torch.equal(new_weight, prev_weight))
 
-    def test_float8_rowwise_fake_quantize(self):
-        """
-        Test that `_Float8RowwiseFakeQuantize` is numerically close to `Float8TrainingTensor`.
-        """
-        torch.manual_seed(self.SEED)
-        dtype = torch.float8_e4m3fn
-        x = torch.randn(32, 64)
-        axiswise_dim = 0
-        out = _Float8RowwiseFakeQuantize.apply(x, dtype, axiswise_dim)
-        out_expected = hp_tensor_to_float8_dynamic(
-            x,
-            dtype,
-            LinearMMConfig(),
-            scaling_granularity=ScalingGranularity.AXISWISE,
-            axiswise_dim=axiswise_dim,
-        ).to_original_precision()
-        torch.testing.assert_close(out, out_expected, atol=0, rtol=0)
-
     def test_qat_fp8a4w_quantizer(self):
         """
         Test basic model training with `Float8ActInt4WeightQATQuantizer`.
@@ -1693,7 +1685,8 @@ def test_qat_fp8a4w_quantizer(self):
         for linear in [m.linear1, m.sub.linear, m.linear2]:
             self.assertIsInstance(linear, FakeQuantizedLinear)
             self.assertIsInstance(
-                linear.activation_fake_quantizer, _Float8RowwiseActivationFakeQuantizer
+                linear.activation_fake_quantizer,
+                Float8FakeQuantizer,
             )
             self.assertIsInstance(linear.weight_fake_quantizer, IntxFakeQuantizer)
         prev_weight = copy.deepcopy(m.linear1.weight)
@@ -1805,9 +1798,6 @@ def test_qat_api_deprecation(self):
                 str(w.message),
             )
 
-    @unittest.skipIf(
-        not TORCH_VERSION_AT_LEAST_2_4, "skipping when torch version is 2.4 or lower"
-    )
     def test_qat_api_convert_no_quantization(self):
         """
         Test that `QATConfig(step="convert")` swaps back to nn modules without quantization.
@@ -1836,6 +1826,82 @@ def test_qat_api_convert_no_quantization(self):
         baseline_out = baseline_model(*x2)
         torch.testing.assert_close(out, baseline_out, atol=0, rtol=0)
 
+    def test_float8_fake_quantize_config(self):
+        """
+        Test that the correct errors are thrown if `Float8FakeQuantizeConfig` is not instantiated properly.
+        """
+        # OK
+        Float8FakeQuantizeConfig(torch.float8_e4m3fn)
+        Float8FakeQuantizeConfig(torch.float8_e4m3fn, PerRow())
+        Float8FakeQuantizeConfig(torch.float8_e4m3fn, PerTensor())
+
+        with self.assertRaisesRegex(ValueError, "not a float8 dtype"):
+            Float8FakeQuantizeConfig(torch.int8)
+        with self.assertRaisesRegex(
+            ValueError, "Please specify the granularity object instead of the class"
+        ):
+            Float8FakeQuantizeConfig(granularity=PerRow)
+        with self.assertRaisesRegex(
+            ValueError, "Expected PerRow or PerTensor granularity"
+        ):
+            Float8FakeQuantizeConfig(granularity=PerToken())
+
+    @parametrize("granularity", [PerTensor(), PerRow()])
+    def test_float8_fake_quantize(self, granularity: Granularity):
+        """
+        Test that `Float8FakeQuantizer` is numerically close to `Float8Tensor`.
+        """
+        dtype = torch.float8_e4m3fn
+        fq_config = Float8FakeQuantizeConfig(dtype, granularity)
+        fake_quantizer = Float8FakeQuantizer(fq_config)
+        torch.manual_seed(self.SEED)
+        x = torch.randn(32, 64)
+        out = fake_quantizer(x)
+        out_expected = Float8Tensor.to_float8(x, dtype, granularity).dequantize()
+        sqnr = compute_error(out, out_expected)
+        self.assertGreater(sqnr, 16)
+
+    @parametrize("granularity", [PerTensor(), PerRow()])
+    @unittest.skipIf(not _CUDA_IS_AVAILABLE, "skipping when cuda is not available")
+    @unittest.skipIf(not is_sm_at_least_89(), "Need sm89+")
+    def test_quantize_api_fp8_fp8(self, granularity: Granularity):
+        """
+        Test the following:
+            quantize_(model, QATConfig(Float8DynamicActivationFloat8Weight(), step="prepare"))
+            quantize_(model, QATConfig(Float8DynamicActivationFloat8Weight(), step="convert"))
+        """
+        torch.manual_seed(self.SEED)
+        m = M().to(torch.bfloat16).cuda()
+        example_inputs = (m.example_inputs()[0].to(torch.bfloat16).cuda(),)
+        base_config = Float8DynamicActivationFloat8WeightConfig(granularity=granularity)
+        quantize_(m, QATConfig(base_config, step="prepare"))
+        m(*example_inputs)
+        quantize_(m, QATConfig(base_config, step="convert"))
+        m(*example_inputs)
+
+    @unittest.skipIf(not _CUDA_IS_AVAILABLE, "skipping when cuda is not available")
+    @unittest.skipIf(not is_sm_at_least_89(), "Need sm89+")
+    @unittest.skipIf(
+        not _is_fbgemm_genai_gpu_available(), "Requires fbgemm-gpu-genai >= 1.2.0"
+    )
+    def test_quantize_api_fp8_int4(self):
+        """
+        Test the following:
+            quantize_(model, QATConfig(Float8DynamicActivationInt4WeightConfig(), step="prepare"))
+            quantize_(model, QATConfig(Float8DynamicActivationInt4WeightConfig(), step="convert"))
+        """
+        torch.manual_seed(self.SEED)
+        m = M().to(torch.bfloat16).cuda()
+        example_inputs = (m.example_inputs()[0].to(torch.bfloat16).cuda(),)
+        base_config = Float8DynamicActivationInt4WeightConfig(group_size=128)
+        quantize_(m, QATConfig(base_config, step="prepare"))
+        m(*example_inputs)
+        quantize_(m, QATConfig(base_config, step="convert"))
+        m(*example_inputs)
+
+
+instantiate_parametrized_tests(TestQAT)
+
 
 if __name__ == "__main__":
     unittest.main()
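
The new `test_float8_fake_quantize_config` test pins down the validation errors the config is expected to raise. The actual `Float8FakeQuantizeConfig` source is not part of this excerpt, but a minimal sketch consistent with the asserted error messages might look like the following; the field names, default granularity, and overall structure are assumptions inferred from the test, not the real implementation.

```
from dataclasses import dataclass, field

import torch

from torchao.quantization.granularity import Granularity, PerRow, PerTensor


@dataclass
class Float8FakeQuantizeConfigSketch:
    """Hypothetical stand-in for Float8FakeQuantizeConfig, reconstructed only
    from the error messages asserted in test_float8_fake_quantize_config."""

    dtype: torch.dtype = torch.float8_e4m3fn
    # The real default granularity is unknown; PerTensor is an assumption.
    granularity: Granularity = field(default_factory=PerTensor)

    def __post_init__(self):
        if self.dtype not in (torch.float8_e4m3fn, torch.float8_e5m2):
            raise ValueError(f"{self.dtype} is not a float8 dtype")
        if isinstance(self.granularity, type):
            raise ValueError(
                "Please specify the granularity object instead of the class"
            )
        if not isinstance(self.granularity, (PerRow, PerTensor)):
            raise ValueError("Expected PerRow or PerTensor granularity")
```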

torchao/quantization/qat/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -15,11 +15,13 @@
 from .fake_quantize_config import (
     FakeQuantizeConfig,
     FakeQuantizeConfigBase,
+    Float8FakeQuantizeConfig,
     IntxFakeQuantizeConfig,
 )
 from .fake_quantizer import (
     FakeQuantizer,
     FakeQuantizerBase,
+    Float8FakeQuantizer,
     IntxFakeQuantizer,
 )
 from .linear import (
@@ -34,6 +36,8 @@
     "QATStep",
     "FakeQuantizeConfigBase",
     "FakeQuantizerBase",
+    "Float8FakeQuantizeConfig",
+    "Float8FakeQuantizer",
     "IntxFakeQuantizeConfig",
     "IntxFakeQuantizer",
     "FakeQuantizedLinear",