From 05eaa6ee62e7a9ab03c5588825338a4b88b6afbf Mon Sep 17 00:00:00 2001 From: yewentao256 Date: Tue, 24 Jun 2025 14:41:41 +0000 Subject: [PATCH 1/2] remove duplicate ceil_div Signed-off-by: yewentao256 --- benchmarks/cutlass_benchmarks/w8a8_benchmarks.py | 4 +--- tests/neuron/1_core/test_prefix_prefill.py | 5 ++--- vllm/attention/ops/nki_flash_attn.py | 4 +--- .../layers/fused_moe/moe_align_block_size.py | 5 +---- vllm/model_executor/layers/quantization/utils/fp8_utils.py | 7 ++++--- 5 files changed, 9 insertions(+), 16 deletions(-) diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index cec422e8d597..ed67e5b50e29 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -17,6 +17,7 @@ from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + ceil_div, w8a8_block_fp8_matmul, ) from vllm.utils import FlexibleArgumentParser @@ -117,9 +118,6 @@ def bench_fp8( scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) - def ceil_div(x: int, y: int) -> int: - return (x + y - 1) // y - block_scale_a = torch.rand( (m, ceil_div(k, 128)), device="cuda", dtype=torch.float32 ) diff --git a/tests/neuron/1_core/test_prefix_prefill.py b/tests/neuron/1_core/test_prefix_prefill.py index 8b9a5f6e4a6a..e07215935711 100644 --- a/tests/neuron/1_core/test_prefix_prefill.py +++ b/tests/neuron/1_core/test_prefix_prefill.py @@ -7,6 +7,8 @@ import torch import torch.nn.functional as F +from vllm.model_executor.layers.quantization.utils.fp8_utils import ceil_div + class BlockDiagonalCausalFromBottomRightMask: @@ -398,9 +400,6 @@ def test_contexted_kv_attention( assert (large_tile_size >= B_P_SIZE ), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}" - def ceil_div(a, b): - return (a + b - 1) // b - def pad_to_multiple(a, b): return ceil_div(a, b) * b diff --git a/vllm/attention/ops/nki_flash_attn.py b/vllm/attention/ops/nki_flash_attn.py index e28ff7e8b4ed..95257458c7ec 100644 --- a/vllm/attention/ops/nki_flash_attn.py +++ b/vllm/attention/ops/nki_flash_attn.py @@ -8,9 +8,7 @@ from neuronxcc import nki from neuronxcc.nki.language import par_dim - -def ceil_div(a, b): - return (a + b - 1) // b +from vllm.model_executor.layers.quantization.utils.fp8_utils import ceil_div def is_power_of_2(x): diff --git a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py index f9451ca2fde4..ce1e1de4ff40 100644 --- a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py +++ b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py @@ -5,14 +5,11 @@ import torch from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.utils.fp8_utils import ceil_div from vllm.triton_utils import tl, triton from vllm.utils import round_up -def ceil_div(a, b): - return (a + b - 1) // b - - @triton.jit def moe_align_block_size_stage1( topk_ids_ptr, diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 754650ebeffb..523ad22de7a2 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -31,6 +31,10 @@ def is_fp8(x: Union[torch.dtype, torch.Tensor]) -> bool: return x == torch.float8_e4m3fn or x == torch.float8_e4m3fnuz +def ceil_div(x: 
int, y: int) -> int: + return (x + y - 1) // y + + def cutlass_scaled_mm( A: torch.Tensor, B: torch.Tensor, @@ -158,9 +162,6 @@ def apply_w8a8_block_fp8_linear( if current_platform.is_cuda(): if current_platform.has_device_capability(100): - def ceil_div(x: int, y: int) -> int: - return (x + y - 1) // y - use_cutlass = cutlass_block_fp8_supported and ( ceil_div(weight.shape[0], 128) == weight_scale.shape[0] and ceil_div(weight.shape[1], 128) == weight_scale.shape[1]) From 1adcde5a5e4c48e5038564aa0acc5a306c3f3354 Mon Sep 17 00:00:00 2001 From: yewentao256 Date: Tue, 24 Jun 2025 15:15:49 +0000 Subject: [PATCH 2/2] using cdiv Signed-off-by: yewentao256 --- benchmarks/cutlass_benchmarks/w8a8_benchmarks.py | 9 +++------ tests/kernels/attention/test_mla_decode_cpu.py | 5 +---- .../attention/test_triton_decode_attention.py | 5 +---- tests/neuron/1_core/test_prefix_prefill.py | 6 +++--- vllm/attention/ops/nki_flash_attn.py | 13 ++++++------- .../layers/fused_moe/moe_align_block_size.py | 5 ++--- .../layers/quantization/utils/fp8_utils.py | 10 +++------- 7 files changed, 19 insertions(+), 34 deletions(-) diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index ed67e5b50e29..a5a5b52f6039 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -17,10 +17,9 @@ from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - ceil_div, w8a8_block_fp8_matmul, ) -from vllm.utils import FlexibleArgumentParser +from vllm.utils import FlexibleArgumentParser, cdiv DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] @@ -118,11 +117,9 @@ def bench_fp8( scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) - block_scale_a = torch.rand( - (m, ceil_div(k, 128)), device="cuda", dtype=torch.float32 - ) + block_scale_a = torch.rand((m, cdiv(k, 128)), device="cuda", dtype=torch.float32) block_scale_b = torch.rand( - ceil_div(k, 128), ceil_div(n, 128), device="cuda", dtype=torch.float32 + cdiv(k, 128), cdiv(n, 128), device="cuda", dtype=torch.float32 ) block_scale_a_M_major = block_scale_a.t().contiguous().t() block_scale_b_K_major = block_scale_b.t().contiguous().t() diff --git a/tests/kernels/attention/test_mla_decode_cpu.py b/tests/kernels/attention/test_mla_decode_cpu.py index 5a7480a6beae..f8b307c595de 100644 --- a/tests/kernels/attention/test_mla_decode_cpu.py +++ b/tests/kernels/attention/test_mla_decode_cpu.py @@ -7,10 +7,7 @@ import vllm._custom_ops as ops from vllm.platforms import current_platform - - -def cdiv(a, b): - return (a + b - 1) // b +from vllm.utils import cdiv def ref_mla( diff --git a/tests/kernels/attention/test_triton_decode_attention.py b/tests/kernels/attention/test_triton_decode_attention.py index 358b374ea75b..2dca720fe330 100644 --- a/tests/kernels/attention/test_triton_decode_attention.py +++ b/tests/kernels/attention/test_triton_decode_attention.py @@ -5,10 +5,7 @@ import torch from vllm.attention.ops.triton_decode_attention import decode_attention_fwd - - -def cdiv(a, b): - return (a + b - 1) // b +from vllm.utils import cdiv @pytest.mark.parametrize("B", [3, 5]) diff --git a/tests/neuron/1_core/test_prefix_prefill.py b/tests/neuron/1_core/test_prefix_prefill.py index e07215935711..abf7febc2955 100644 --- a/tests/neuron/1_core/test_prefix_prefill.py +++ b/tests/neuron/1_core/test_prefix_prefill.py @@ 
-7,7 +7,7 @@ import torch import torch.nn.functional as F -from vllm.model_executor.layers.quantization.utils.fp8_utils import ceil_div +from vllm.utils import cdiv class BlockDiagonalCausalFromBottomRightMask: @@ -401,7 +401,7 @@ def test_contexted_kv_attention( ), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}" def pad_to_multiple(a, b): - return ceil_div(a, b) * b + return cdiv(a, b) * b def pad_to_next_power_of_2(a): assert a > 0 @@ -410,7 +410,7 @@ def pad_to_next_power_of_2(a): # calculate input shapes max_num_queries = pad_to_next_power_of_2(sum(query_lens)) context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens) - num_active_blocks = ceil_div(context_lens, block_size).sum().item() + num_active_blocks = cdiv(context_lens, block_size).sum().item() num_active_blocks = pad_to_multiple(num_active_blocks, large_tile_size // block_size) context_kv_len = num_active_blocks * block_size diff --git a/vllm/attention/ops/nki_flash_attn.py b/vllm/attention/ops/nki_flash_attn.py index 95257458c7ec..29fa43201761 100644 --- a/vllm/attention/ops/nki_flash_attn.py +++ b/vllm/attention/ops/nki_flash_attn.py @@ -8,7 +8,7 @@ from neuronxcc import nki from neuronxcc.nki.language import par_dim -from vllm.model_executor.layers.quantization.utils.fp8_utils import ceil_div +from vllm.utils import cdiv def is_power_of_2(x): @@ -33,11 +33,10 @@ def load_block_tables(block_tables_hbm, num_tiles, num_blocks_per_tile): (num_tiles, num_blocks_per_tile)) block_tables_sbuf = nl.zeros( - (ceil_div(num_tiles, - B_P_SIZE), par_dim(B_P_SIZE), num_blocks_per_tile), + (cdiv(num_tiles, B_P_SIZE), par_dim(B_P_SIZE), num_blocks_per_tile), dtype=nl.int32, ) - for i in nl.affine_range(ceil_div(num_tiles, B_P_SIZE)): + for i in nl.affine_range(cdiv(num_tiles, B_P_SIZE)): i_p = nl.arange(B_P_SIZE)[:, None] i_f = nl.arange(num_blocks_per_tile)[None, :] block_tables_sbuf[i, i_p, i_f] = nl.load( @@ -81,7 +80,7 @@ def transform_block_tables_for_indirect_load( assert is_power_of_2( num_blocks_per_tile), f"{num_blocks_per_tile=} is not power of 2" - num_loads = ceil_div(num_blocks_per_tile, B_P_SIZE) + num_loads = cdiv(num_blocks_per_tile, B_P_SIZE) block_tables_transposed = nl.ndarray( ( num_loads, @@ -163,7 +162,7 @@ def load_kv_tile_from_cache( equivalent to (par_dim(B_P_SIZE), seqlen_kv // B_P_SIZE * B_D_SIZE) """ # load key cache - num_loads = ceil_div(num_blocks_per_large_tile, B_P_SIZE) + num_loads = cdiv(num_blocks_per_large_tile, B_P_SIZE) for load_idx in nl.affine_range(num_loads): i_p = nl.arange(B_P_SIZE)[:, None] i_f = nl.arange(tiled_block_size * B_D_SIZE)[None, :] @@ -603,7 +602,7 @@ def flash_paged_attention( ) for large_k_tile_idx in nl.sequential_range(0, num_large_k_tile): - num_loads = ceil_div(num_blocks_per_large_tile, B_P_SIZE) + num_loads = cdiv(num_blocks_per_large_tile, B_P_SIZE) cur_k_tile = nl.ndarray( (par_dim(B_D_SIZE), LARGE_TILE_SZ), dtype=kernel_dtype, diff --git a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py index ce1e1de4ff40..ceb96add0fde 100644 --- a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py +++ b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py @@ -5,9 +5,8 @@ import torch from vllm import _custom_ops as ops -from vllm.model_executor.layers.quantization.utils.fp8_utils import ceil_div from vllm.triton_utils import tl, triton -from vllm.utils import round_up +from vllm.utils import cdiv, round_up @triton.jit @@ -112,7 +111,7 @@ def moe_align_block_size_triton( cumsum = 
torch.zeros((num_experts + 1, ), dtype=torch.int32, device=topk_ids.device) - tokens_per_thread = ceil_div(numel, num_experts) + tokens_per_thread = cdiv(numel, num_experts) moe_align_block_size_stage1[grid]( topk_ids, diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 523ad22de7a2..3a0fb83d627a 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -19,7 +19,7 @@ CUTLASS_BLOCK_FP8_SUPPORTED) from vllm.platforms import current_platform from vllm.triton_utils import tl, triton -from vllm.utils import direct_register_custom_op +from vllm.utils import cdiv, direct_register_custom_op logger = init_logger(__name__) has_deep_gemm = importlib.util.find_spec("deep_gemm") is not None @@ -31,10 +31,6 @@ def is_fp8(x: Union[torch.dtype, torch.Tensor]) -> bool: return x == torch.float8_e4m3fn or x == torch.float8_e4m3fnuz -def ceil_div(x: int, y: int) -> int: - return (x + y - 1) // y - - def cutlass_scaled_mm( A: torch.Tensor, B: torch.Tensor, @@ -163,8 +159,8 @@ def apply_w8a8_block_fp8_linear( if current_platform.has_device_capability(100): use_cutlass = cutlass_block_fp8_supported and ( - ceil_div(weight.shape[0], 128) == weight_scale.shape[0] - and ceil_div(weight.shape[1], 128) == weight_scale.shape[1]) + cdiv(weight.shape[0], 128) == weight_scale.shape[0] + and cdiv(weight.shape[1], 128) == weight_scale.shape[1]) else: # TODO: update this after switching to public sm90 block scale gemm # as it also supports weight.shape % 128 != 0
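
For reference, both patches converge on the shared ceiling-division helper in vllm.utils. Below is a minimal standalone sketch of that helper and the identity it relies on; it assumes cdiv keeps the same semantics as the removed ceil_div definitions for positive integers, and the exact body in vllm/utils.py may differ.

# Standalone sketch (not part of the patches above): the consolidated helper
# is assumed to behave like the deleted ceil_div definitions, i.e. ceiling
# division for positive integers. The exact implementation in vllm/utils.py
# may differ.
def cdiv(a: int, b: int) -> int:
    """Ceiling division: smallest integer n such that n * b >= a (for b > 0)."""
    return -(a // -b)  # equal to (a + b - 1) // b for positive a and b


# Quick equivalence check against the deleted helper from the diffs above.
def _old_ceil_div(x: int, y: int) -> int:
    return (x + y - 1) // y


for a in range(1, 300):
    for b in (1, 3, 64, 128):
        assert cdiv(a, b) == _old_ceil_div(a, b)

Consolidating on this single helper (PATCH 2) avoids re-deriving the same expression in benchmarks, tests, and kernels, and drops the intermediate step from PATCH 1 in which the Neuron and MoE code paths imported ceil_div from the quantization utilities.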