
Commit 95f42d0

support select.int for Float8Tensor
Summary:

This is useful for stitching together 2D weights into a 3D weight; specifically, this happens in vLLM for HF models, where expert weights are 2D.

Test Plan:

```bash
pytest test/quantization/quantize_/workflows/float8/test_float8_tensor.py -x -s -k index
```

Reviewers:

Subscribers:

Tasks:

Tags:
1 parent 8e2ca35 commit 95f42d0
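
For context, here is a minimal sketch of the pattern this enables; the shapes, the `num_experts` loop, and the variable names are illustrative assumptions, not vLLM's actual loader code. A single 3D `Float8Tensor` is built with `Float8Tensor.from_hp`, then indexed per expert, with each `w3d_fp8[expert_id]` dispatching to the `aten.select.int` override added in this commit:

```python
import torch

from torchao.quantization.quantize_.workflows.float8.float8_tensor import Float8Tensor

# Illustrative shapes only: one 3D tensor holding all expert weights.
num_experts, N, K = 8, 256, 512
w3d = torch.randn(num_experts, N, K, device="cuda", dtype=torch.bfloat16)
w3d_fp8 = Float8Tensor.from_hp(w3d)

for expert_id in range(num_experts):
    # `w3d_fp8[expert_id]` dispatches to aten.select.int and yields a
    # 2D Float8Tensor for a single expert's weight.
    w2d_fp8 = w3d_fp8[expert_id]
    assert w2d_fp8.shape == (N, K)
```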

File tree

2 files changed: +34 -0 lines changed

test/quantization/quantize_/workflows/float8/test_float8_tensor.py

Lines changed: 18 additions & 0 deletions
Lines changed: 18 additions & 0 deletions

```diff
@@ -25,6 +25,7 @@
     quantize_,
 )
 from torchao.quantization.quantize_.common import KernelPreference
+from torchao.quantization.quantize_.workflows.float8.float8_tensor import Float8Tensor
 from torchao.quantization.utils import compute_error
 from torchao.testing.utils import TorchAOIntegrationTestCase
 from torchao.utils import (
@@ -446,6 +447,23 @@ def test_expected_gpu_kernel_fbgemm(self):
             ".run("
         ).run(code[0])
 
+    @unittest.skipIf(not is_sm_at_least_90(), "Need sm90+")
+    def test_index_select(self):
+        """
+        test that `x_0 = x[0]` works when `x` is a 3D `Float8Tensor`. This is
+        useful when stitching checkpoints of `num_experts` 2D parameters into
+        a single 3D parameter when converting between model definitions that
+        use 2D and 3D parameters for their expert weights.
+        """
+
+        E, K, N = 128, 256, 512
+        x = torch.randn(E, N, K, device="cuda", dtype=torch.bfloat16)
+        x_fp8 = Float8Tensor.from_hp(x)
+        x_fp8_1 = x_fp8[1]
+        torch.testing.assert_close(
+            x_fp8.dequantize()[1], x_fp8_1.dequantize(), atol=0, rtol=0
+        )
+
 
 common_utils.instantiate_parametrized_tests(TestFloat8Tensor)
 
```
torchao/quantization/quantize_/workflows/float8/float8_tensor.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -617,6 +617,22 @@ def _(func, types, args, kwargs):
617617
return return_and_correct_aliasing(func, args, kwargs, new)
618618

619619

620+
@implements(aten.select.int)
621+
def _(func, types, args, kwargs):
622+
old_float8_tensor, dim, index = args
623+
assert dim == 0, f"Float8Tensor aten.select.int with {dim=} is not yet supported"
624+
new_float8_tensor = old_float8_tensor.__class__(
625+
old_float8_tensor.qdata[index],
626+
old_float8_tensor.scale[index],
627+
old_float8_tensor.block_size[1:],
628+
old_float8_tensor.mm_config,
629+
old_float8_tensor.act_quant_kwargs,
630+
old_float8_tensor.kernel_preference,
631+
old_float8_tensor.dtype,
632+
)
633+
return return_and_correct_aliasing(func, args, kwargs, new_float8_tensor)
634+
635+
620636
Float8Tensor.__module__ = "torchao.quantization"
621637

622638
# Allow a model with Float8Tensor weights to be loaded with `weights_only=True`
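
Conceptually, the override narrows each component by the selected leading index: `qdata` and `scale` are indexed, and the leading `block_size` entry is dropped, so dequantizing the selected slice matches slicing the dequantized whole exactly. A small sketch of that invariant follows; the shapes are arbitrary, and it mirrors the new test rather than introducing any new API:

```python
import torch

from torchao.quantization.quantize_.workflows.float8.float8_tensor import Float8Tensor

E, N, K = 4, 32, 64
x = torch.randn(E, N, K, device="cuda", dtype=torch.bfloat16)
x_fp8 = Float8Tensor.from_hp(x)

x0 = x_fp8[0]  # aten.select.int with dim=0; other dims assert for now

# The leading block_size entry is dropped for the 2D result.
assert list(x0.block_size) == list(x_fp8.block_size)[1:]

# Selecting then dequantizing matches dequantizing then selecting, bitwise.
torch.testing.assert_close(x_fp8.dequantize()[0], x0.dequantize(), atol=0, rtol=0)
```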
