 )
 from torchao.utils import TORCH_VERSION_AT_LEAST_2_3, TORCH_VERSION_AT_LEAST_2_5
 from torchao.quantization.utils import quantize_activation_per_token_absmax
+from torchao.float8.inference import addmm_float8_unwrapped_inference
 
 import torch.nn.functional as F
 
@@ -518,14 +519,16 @@ def get_per_token_block_size(x):
             input_float=x,
             block_size=get_per_token_block_size(x),
             target_dtype=input_target_dtype,
-            layout_type=layout_type
+            layout_type=layout_type,
+            scale_dtype=torch.float32,
         )
         block_size = get_weight_block_size(weight)
         weight = to_affine_quantized_floatx(
             input_float=weight,
             block_size=block_size,
             target_dtype=target_dtype,
-            layout_type=layout_type
+            layout_type=layout_type,
+            scale_dtype=torch.float32,
         )
         weight = super(AQFloat8DynamicallyQuantizedLinearWeight, cls).from_float(weight, input_quant_func)
         return weight
@@ -555,14 +558,11 @@ def _autoquant_test(cls, act_mat, weight, bias, best_time, mode=["relu", None]):
         x_vals_float8, x_scales = quantize_activation_per_token_absmax(
             act_mat.reshape(-1, act_mat.shape[-1]), dtype=torch.float8_e4m3fn
         )
-        quantized_matmul = (
-            lambda x_vals_float8, x_scales, w_vals_float8:
-                safe_int_mm(x_vals_float8, w_vals_float8) * x_scales
-        )
-        q_c_matmul = torch.compile(quantized_matmul, mode="max-autotune-no-cudagraphs")
+        q_c_matmul = torch.compile(addmm_float8_unwrapped_inference, mode="max-autotune-no-cudagraphs")
         with torch.no_grad():
             w_vals_float8 = w_qtensor.original_weight_tensor.layout_tensor.float8_data.contiguous().t()
-            res_matmul = do_autoquant_bench(q_c_matmul, x_vals_float8, x_scales.reshape(-1, 1), w_vals_float8)
+            w_scales = w_qtensor.original_weight_tensor.layout_tensor.scale
+            res_matmul = do_autoquant_bench(q_c_matmul, x_vals_float8, x_scales.reshape(-1, 1), w_vals_float8, w_scales.reshape(1, -1), torch.float32)
         print(f">>time: {res_matmul:0.3f}ms for {cls} matmul, to_beat: {best_time:0.3f}ms")
 
         # if the (much faster) matmul kernel is already beat, don't bother benchmarking full op
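For orientation, the compiled benchmark above swaps the previous int8 safe_int_mm-based lambda for torchao's fused, scaled float8 GEMM (addmm_float8_unwrapped_inference). Below is a rough dequantize-then-matmul reference for what that call computes; the helper name and the float32 casts are illustrative assumptions, not part of this diff, and the scale shapes mirror the reshape(-1, 1) / reshape(1, -1) calls used in the benchmark.

import torch

def scaled_fp8_mm_reference(x_fp8, x_scales, w_fp8_t, w_scales, out_dtype=torch.float32):
    # Illustrative reference only: dequantize both operands and matmul in float32,
    # applying per-token activation scales of shape (M, 1) and per-channel weight
    # scales of shape (1, N). The real path fuses the scaling into one float8 GEMM.
    return ((x_fp8.to(torch.float32) * x_scales) @ (w_fp8_t.to(torch.float32) * w_scales)).to(out_dtype)

x = torch.randn(4, 8).to(torch.float8_e4m3fn)      # quantized activations (M, K)
w_t = torch.randn(8, 16).to(torch.float8_e4m3fn)   # transposed quantized weight (K, N)
out = scaled_fp8_mm_reference(x, torch.ones(4, 1), w_t, torch.ones(1, 16))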
@@ -586,6 +586,7 @@ def _autoquant_test(cls, act_mat, weight, bias, best_time, mode=["relu", None]):
     # AQInt8WeightOnlyQuantizedLinearWeight3,
     # TODO this gets picked in places where it makes perf worse, why?
     AQInt8DynamicallyQuantizedLinearWeight,
+    AQFloat8DynamicallyQuantizedLinearWeight,
 ]
 
 DEFAULT_INT4_AUTOQUANT_CLASS_LIST = [
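With the float8 subclass added to the default candidate list, a minimal usage sketch follows. Assumptions not shown in this diff: the edited file is torchao/quantization/autoquant.py, the list above is DEFAULT_AUTOQUANT_CLASS_LIST, autoquant() accepts it through its qtensor_class_list argument, and a CUDA device with float8 support is available.

import torch
import torchao
from torchao.quantization.autoquant import DEFAULT_AUTOQUANT_CLASS_LIST  # assumed name of the list edited above

# Toy model; any module containing nn.Linear layers is handled the same way.
model = torch.nn.Sequential(torch.nn.Linear(1024, 1024)).to("cuda", torch.bfloat16)

# Wrap with autoquant so the candidate weight subclasses (now including the float8 one)
# are benchmarked per shape; qtensor_class_list is assumed to be the selection knob.
model = torchao.autoquant(torch.compile(model), qtensor_class_list=DEFAULT_AUTOQUANT_CLASS_LIST)

x = torch.randn(16, 1024, device="cuda", dtype=torch.bfloat16)
model(x)  # running the model records shapes and triggers the per-kernel benchmarking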