Skip to content

Commit 3298b91

Browse files
committed
Fix padded FP4 scaling, and enable its usage in flashinfer_scaled_fp4_mm
Signed-off-by: Roi Koren <roik@nvidia.com>
1 parent 7c2ec0f commit 3298b91

File tree

2 files changed

+2
-4
lines changed

2 files changed

+2
-4
lines changed

vllm/_custom_ops.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1374,7 +1374,7 @@ def scaled_fp4_quant(
13741374
)
13751375

13761376
# Two fp4 values will be packed into an uint8.
1377-
output = torch.empty((m, n // 2), device=device, dtype=torch.uint8)
1377+
output = torch.zeros((m, n // 2), device=device, dtype=torch.uint8)
13781378

13791379
# We use the rounded values to store the swizzled values. Due to the
13801380
# requirement of the Tensor Core, the minimum tile is 128x4 for the scales.
@@ -1385,7 +1385,7 @@ def scaled_fp4_quant(
13851385
rounded_m = round_up(m, 128)
13861386
scale_n = n // block_size
13871387
rounded_n = round_up(scale_n, 4)
1388-
output_scale = torch.empty(
1388+
output_scale = torch.zeros(
13891389
(rounded_m, rounded_n // 4), device=device, dtype=torch.int32
13901390
)
13911391

vllm/utils/flashinfer.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -384,8 +384,6 @@ def flashinfer_scaled_fp4_mm(
384384
assert block_scale_a.ndim == 2 and block_scale_b.ndim == 2
385385
assert a.stride(-1) == 1 and b.stride(-1) == 1
386386
assert a.shape[1] == b.shape[1]
387-
assert block_scale_a.shape[1] == a.shape[1] // 8
388-
assert block_scale_b.shape[1] == b.shape[1] // 8
389387

390388
if backend == "cutlass":
391389
block_scale_a = block_scale_a.view(torch.uint8)

0 commit comments

Comments (0)