[wip] enable 3d weights for NVFP4Tensor

vkuzo · vkuzo · commit fed1f442a0a5 · 2025-10-01T12:53:02.000-07:00
Summary: This doesn't work yet, stay tuned. It is needed for vLLM, which stitches 2d weights into a 3d weight for MoEs. Test Plan: Reviewers: Subscribers: Tasks: Tags: ghstack-source-id: 9f4b94d ghstack-comment-id: 3357908175 Pull-Request: #3109
diff --git a/test/prototype/mx_formats/test_inference_workflow.py b/test/prototype/mx_formats/test_inference_workflow.py
@@ -218,3 +218,13 @@ def test_narrow_similar_to_vllm(self):
             gemm_kernel_choice=MXGemmKernelChoice.EMULATED,
         )
         self._test_narrow_similar_to_vllm(config)
+
+    # TODO(next): make this test pass by enabling 3d NVFP4Tensor, currently a lot
+    # of places hardcode 2d
+    def test_nvfp4_quantize_3d_param_similar_to_vllm(self):
+        # Run the shared 3d-parameter quantization check with a weight-only
+        # NVFP4 config, with the triton kernel and the dynamic per-tensor
+        # scale both disabled.
+        self._test_quantize_3d_param_similar_to_vllm(
+            NVFP4InferenceConfig(
+                mm_config=NVFP4MMConfig.WEIGHT_ONLY,
+                use_triton_kernel=False,
+                use_dynamic_per_tensor_scale=False,
+            )
+        )
diff --git a/torchao/testing/utils.py b/torchao/testing/utils.py
@@ -625,6 +625,18 @@ def _test_narrow_similar_to_vllm(self, config: AOBaseConfig):
                 f"shape mismatch: {orig_attr.shape} vs {new_attr.shape}"
             )
 
+    def _test_quantize_3d_param_similar_to_vllm(self, config: AOBaseConfig):
+        # Mimics what happens when vLLM loads empty MoE weights and then
+        # quantizes them: a linear module ends up holding a 3d weight and is
+        # passed through `quantize_` with the given config. Nothing is asserted
+        # on the result; the check is only that quantization does not raise.
+
+        dtype = torch.bfloat16
+        # NOTE(review): the explicit device="cuda" argument appears to take
+        # precedence over the enclosing meta-device context, so the module is
+        # likely allocated on CUDA rather than on meta -- confirm which one
+        # was intended.
+        with torch.device("meta"):
+            linear = torch.nn.Linear(1024, 1024, device="cuda", dtype=dtype)
+        # Swap the 2d weight for a 3d one (presumably one 2d weight per
+        # expert, stacked along dim 0 -- TODO confirm).
+        stacked_weight = torch.randn(60, 2816, 2048, device="cuda", dtype=dtype)
+        linear.weight = torch.nn.Parameter(stacked_weight)
+        quantize_(linear, config)
+
 
 common_utils.instantiate_parametrized_tests(TorchAOBasicTestCase)
 common_utils.instantiate_parametrized_tests(TorchAOCompileTestCase)