[Experimental] Float8 support in AQT #671

Merged
merged 35 commits into main from experimental_float8_aqt on Aug 28, 2024
Changes from all commits
35 commits
12d0ac2
updates for float
jainapurva Aug 13, 2024
9173ee1
Added optional param
jainapurva Aug 14, 2024
32bf45f
updates for compatible type
jainapurva Aug 14, 2024
ddc1bc0
updates to support torch=2.2
jainapurva Aug 14, 2024
b3e4e79
updates to quant api
jainapurva Aug 19, 2024
e778bca
updates
jainapurva Aug 19, 2024
d4b057f
Float8 updates
jainapurva Aug 20, 2024
9d86df3
Merge branch 'main' into experimental_float8_aqt
jainapurva Aug 20, 2024
04c471e
todos
jainapurva Aug 20, 2024
d86d798
version check
jainapurva Aug 21, 2024
1e5dfd9
Updates for removing float8 API
jainapurva Aug 21, 2024
e25a8e9
Add float8wo to hf_eval
jainapurva Aug 22, 2024
29316fa
remove fpx layout
jainapurva Aug 22, 2024
1fb4a2b
revert changes to hf_eval
jainapurva Aug 22, 2024
ceb3275
Revert changes to main
jainapurva Aug 22, 2024
81eb91f
Fp8 upgrades
jainapurva Aug 23, 2024
c017186
Test for float8
jainapurva Aug 23, 2024
ca9fdbf
Updates
jainapurva Aug 23, 2024
fce977f
Merge branch 'main' into experimental_float8_aqt
jainapurva Aug 23, 2024
bfb8b3b
Remove from_float_float8
jainapurva Aug 23, 2024
146c328
Update optional[int]
jainapurva Aug 23, 2024
9b0c2aa
remove from_float8
jainapurva Aug 23, 2024
3c300eb
Review fixes
jainapurva Aug 24, 2024
efd480b
Review fixes
jainapurva Aug 24, 2024
0abdcc1
typos
jainapurva Aug 24, 2024
5fc0dd8
typos
jainapurva Aug 24, 2024
526a282
Added constraints
jainapurva Aug 24, 2024
83b7356
Added constraints
jainapurva Aug 24, 2024
3ac9f72
Remove eps check
jainapurva Aug 27, 2024
8bd3849
Separate floatx from_float
jainapurva Aug 27, 2024
02a89b3
Typing fixes
jainapurva Aug 27, 2024
8e19598
init fixes
jainapurva Aug 27, 2024
56a4eb5
Updates for fixes
jainapurva Aug 27, 2024
33ae19a
Review fixes
jainapurva Aug 27, 2024
482f537
Revert a doc string change
jainapurva Aug 27, 2024
6 changes: 4 additions & 2 deletions test/dtypes/test_affine_quantized.py
@@ -8,6 +8,7 @@
int8_dynamic_activation_int4_weight,
int8_dynamic_activation_int8_weight,
int8_dynamic_activation_int8_semi_sparse_weight,
float8_weight_only,
)
from torchao.dtypes import (
to_affine_quantized,
@@ -18,6 +19,7 @@
import unittest
import tempfile


class TestAffineQuantized(TestCase):
@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
def test_tensor_core_layout_transpose(self):
@@ -40,7 +42,8 @@ def test_tensor_core_layout_transpose(self):

@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
def test_weights_only(self):
for apply_quant in [int4_weight_only(group_size=32), int8_weight_only(), int8_dynamic_activation_int4_weight(), int8_dynamic_activation_int8_weight(), int8_dynamic_activation_int8_semi_sparse_weight()]:
for apply_quant in [int4_weight_only(group_size=32), int8_weight_only(), int8_dynamic_activation_int4_weight(),
int8_dynamic_activation_int8_weight(), int8_dynamic_activation_int8_semi_sparse_weight(), float8_weight_only()]:
l = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
ql = apply_quant(l)
with tempfile.NamedTemporaryFile() as f:
@@ -69,6 +72,5 @@ def test_to_device(self):
ql.cuda()



if __name__ == "__main__":
run_tests()
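
For orientation, a minimal standalone sketch of what the new float8 entry in test_weights_only exercises; the import path, the CUDA requirement, and the need for a recent PyTorch (for float8 dtypes and weights_only-safe loading) are assumptions taken from the surrounding test code rather than part of this diff:

```python
import tempfile

import torch
from torchao.quantization.quant_api import float8_weight_only

# Quantize a bf16 Linear's weight to float8_e4m3fn, then round-trip the
# state_dict through torch.save / torch.load with weights_only=True.
linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
quantized = float8_weight_only()(linear)  # returns the module with its weight replaced

with tempfile.NamedTemporaryFile() as f:
    torch.save(quantized.state_dict(), f)
    f.seek(0)
    state_dict = torch.load(f, weights_only=True)
```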
2 changes: 2 additions & 0 deletions torchao/dtypes/__init__.py
@@ -5,6 +5,7 @@
AffineQuantizedTensor,
to_affine_quantized,
to_affine_quantized_static,
to_affine_quantized_floatx,
LayoutType,
PlainLayoutType,
SemiSparseLayoutType,
@@ -18,6 +19,7 @@
"AffineQuantizedTensor",
"to_affine_quantized",
"to_affine_quantized_static",
"to_affine_quantized_floatx",
"LayoutType",
"PlainLayoutType",
"SemiSparseLayoutType",
32 changes: 30 additions & 2 deletions torchao/dtypes/affine_quantized_tensor.py
@@ -11,6 +11,7 @@
MappingType,
int_scaled_matmul,
quantize_affine_hqq,
FP8_TYPES,
)
from torchao.quantization.utils import (
pack_tinygemm_scales_and_zeros,
@@ -36,7 +37,6 @@

aten = torch.ops.aten


###############################
# Base Layout Tensor Subclass #
###############################
@@ -91,7 +91,7 @@ class AffineQuantizedTensor(TorchAOBaseTensor):
shape (torch.Size): the shape for the Tensor
quant_min (Optional[int]): minimum quantized value for the Tensor, if not specified, it will be derived from dtype of `int_data`
quant_max (Optional[int]): maximum quantized value for the Tensor, if not specified, it will be derived from dtype of `int_data`
zero_point_domain (ZeroPointDomain): the domain that zero_point is in, should be eitehr integer or float
zero_point_domain (ZeroPointDomain): the domain that zero_point is in, should be either integer or float
if zero_point is in integer domain, zero point is added to the quantized integer value during
quantization
if zero_point is in floating point domain, zero point is subtracted from the floating point (unquantized)
@@ -260,6 +260,33 @@ def from_float_static(
dtype=input_float.dtype,
)

@classmethod
def from_float_to_floatx(
cls,
input_float: torch.Tensor,
block_size: Tuple[int, ...],
target_dtype: torch.dtype = torch.float8_e4m3fn,
layout_type: LayoutType = PlainLayoutType(),
):
if target_dtype in FP8_TYPES:
return cls.from_float(
input_float=input_float,
mapping_type=MappingType.SYMMETRIC,
block_size=block_size,
target_dtype=target_dtype,
quant_min=math.ceil(torch.finfo(target_dtype).min),
quant_max=math.ceil(torch.finfo(target_dtype).max),
eps=torch.finfo(torch.float32).eps,
scale_dtype=None,
zero_point_dtype=None,
preserve_zero=True,
zero_point_domain=ZeroPointDomain.INT,
layout_type=layout_type,
use_hqq=False,
)
else:
raise NotImplementedError(f"Unsupported dtype {target_dtype} for from_float_to_floatx")

@property
def layout_type(self) -> LayoutType:
return self.layout_tensor.layout_type
@@ -974,6 +1001,7 @@ def _(func, types, args, kwargs):

to_affine_quantized = AffineQuantizedTensor.from_float
to_affine_quantized_static = AffineQuantizedTensor.from_float_static
to_affine_quantized_floatx = AffineQuantizedTensor.from_float_to_floatx

if TORCH_VERSION_AT_LEAST_2_5:
# Allow a model with AffineQuantizedTensor weights to be loaded with `weights_only=True`
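
As a quick illustration, a hedged sketch of calling the new to_affine_quantized_floatx alias directly; the argument values mirror the per-channel setup used by float8_weight_only further down, and running this on a plain CPU bf16 tensor is an assumption, not something this diff tests:

```python
import torch
from torchao.dtypes import to_affine_quantized_floatx

weight = torch.randn(256, 128, dtype=torch.bfloat16)

# One quantization block per output channel: each block spans the whole
# input dimension, so every row gets its own float8 scale.
fp8_weight = to_affine_quantized_floatx(
    input_float=weight,
    block_size=(1, weight.shape[1]),
    target_dtype=torch.float8_e4m3fn,
)

# The subclass keeps the original "external" dtype; the float8 data lives in
# the plain layout tensor underneath.
print(type(fp8_weight).__name__, fp8_weight.dtype, fp8_weight.shape)
```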
17 changes: 15 additions & 2 deletions torchao/quantization/quant_api.py
@@ -27,7 +27,8 @@
TensorCoreTiledLayoutType,
PlainLayoutType,
AffineQuantizedTensor,
SemiSparseLayoutType
SemiSparseLayoutType,
to_affine_quantized_floatx
)
from torchao.utils import (
TORCH_VERSION_AT_LEAST_2_4,
@@ -57,7 +58,6 @@
import logging
from .autoquant import autoquant, AutoQuantizableLinearWeight


__all__ = [
"swap_conv2d_1x1_to_linear",
"Quantizer",
@@ -72,6 +72,7 @@
"int8_dynamic_activation_int8_semi_sparse_weight",
"int4_weight_only",
"int8_weight_only",
"float8_weight_only",
]

from .GPTQ import (
@@ -488,6 +489,18 @@ def int8_dynamic_activation_int8_semi_sparse_weight():
"""
return int8_dynamic_activation_int8_weight(layout_type=SemiSparseLayoutType())

def float8_weight_only(target_dtype: torch.dtype = torch.float8_e4m3fn):
"""
Applies float8 weight-only symmetric per-channel quantization to linear layers.
"""
from torchao.dtypes import to_affine_quantized_floatx
def apply_float8wo_quant(weight):
# avoid circular dep
Review comment (Contributor): maybe you want to import to_affine_quantized_floatx here; I'm also refactoring this file to change the imports to avoid the circular dep as well.

block_size = (1, weight.shape[1])
return to_affine_quantized_floatx(input_float=weight, block_size=block_size, target_dtype=target_dtype)

return _get_linear_subclass_inserter(apply_float8wo_quant)


def uintx_weight_only(dtype, group_size=64, pack_dim=-1):
"""
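
An end-to-end sketch of the new config in use; quantize_ itself is not touched by this PR and is assumed to already be exported from quant_api, and the bf16/CUDA setup is illustrative only:

```python
import torch
from torchao.quantization.quant_api import quantize_, float8_weight_only

model = torch.nn.Sequential(torch.nn.Linear(1024, 1024)).to(torch.bfloat16).cuda()

# Swap each nn.Linear weight for a float8_e4m3fn AffineQuantizedTensor;
# activations stay in bfloat16 (weight-only quantization).
quantize_(model, float8_weight_only())

x = torch.randn(8, 1024, dtype=torch.bfloat16, device="cuda")
y = model(x)
```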
15 changes: 12 additions & 3 deletions torchao/quantization/quant_primitives.py
@@ -8,7 +8,6 @@
from typing import List, Optional, Tuple, Dict, Callable, Union
import torch, math


from torchao.kernel.intmm import int_scaled_matmul
from torchao.kernel.intmm import safe_int_mm
from torchao.utils import (
@@ -58,6 +57,13 @@ class ZeroPointDomain(Enum):
if TORCH_VERSION_AT_LEAST_2_5:
torch.serialization.add_safe_globals([MappingType, ZeroPointDomain])

FP8_TYPES = {
torch.float8_e4m3fn,
torch.float8_e5m2,
torch.float8_e4m3fnuz,
torch.float8_e5m2fnuz,
}

"""
Map from dtype to the bound value of integers
TODO: maybe can replace this with call to torch.iinfo
@@ -95,9 +101,12 @@ def _get_and_check_qmin_qmax(dtype, quant_min, quant_max):
verify that they are within the range of possible quant_min/quant_max
for dtype
"""
if dtype not in _DTYPE_TO_QVALUE_BOUNDS:
if dtype in FP8_TYPES:
quant_min_lower_bound, quant_max_upper_bound = torch.finfo(dtype).min, torch.finfo(dtype).max
elif dtype not in _DTYPE_TO_QVALUE_BOUNDS:
raise ValueError(f"Unsupported dtype: {dtype}")
quant_min_lower_bound, quant_max_upper_bound = _DTYPE_TO_QVALUE_BOUNDS[dtype]
else:
quant_min_lower_bound, quant_max_upper_bound = _DTYPE_TO_QVALUE_BOUNDS[dtype]
if quant_min is None:
quant_min = quant_min_lower_bound
if quant_max is None:
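
To make the new bounds handling concrete, a small sketch of the ranges _get_and_check_qmin_qmax now derives for float8 dtypes; the printed values come from torch.finfo, and math.ceil mirrors what from_float_to_floatx does when it fills in quant_min/quant_max:

```python
import math

import torch

# Representable ranges for the FP8_TYPES handled by the new branch.
for dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
    lo, hi = torch.finfo(dtype).min, torch.finfo(dtype).max
    print(dtype, math.ceil(lo), math.ceil(hi))
# torch.float8_e4m3fn -> -448, 448
# torch.float8_e5m2   -> -57344, 57344
```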