From 05eaa6ee62e7a9ab03c5588825338a4b88b6afbf Mon Sep 17 00:00:00 2001 From: yewentao256 Date: Tue, 24 Jun 2025 14:41:41 +0000 Subject: [PATCH 1/2] remove duplicate ceil_div Signed-off-by: yewentao256 --- benchmarks/cutlass_benchmarks/w8a8_benchmarks.py | 4 +--- tests/neuron/1_core/test_prefix_prefill.py | 5 ++--- vllm/attention/ops/nki_flash_attn.py | 4 +--- .../layers/fused_moe/moe_align_block_size.py | 5 +---- vllm/model_executor/layers/quantization/utils/fp8_utils.py | 7 ++++--- 5 files changed, 9 insertions(+), 16 deletions(-) diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index cec422e8d597..ed67e5b50e29 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -17,6 +17,7 @@ from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + ceil_div, w8a8_block_fp8_matmul, ) from vllm.utils import FlexibleArgumentParser @@ -117,9 +118,6 @@ def bench_fp8( scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) - def ceil_div(x: int, y: int) -> int: - return (x + y - 1) // y - block_scale_a = torch.rand( (m, ceil_div(k, 128)), device="cuda", dtype=torch.float32 ) diff --git a/tests/neuron/1_core/test_prefix_prefill.py b/tests/neuron/1_core/test_prefix_prefill.py index 8b9a5f6e4a6a..e07215935711 100644 --- a/tests/neuron/1_core/test_prefix_prefill.py +++ b/tests/neuron/1_core/test_prefix_prefill.py @@ -7,6 +7,8 @@ import torch import torch.nn.functional as F +from vllm.model_executor.layers.quantization.utils.fp8_utils import ceil_div + class BlockDiagonalCausalFromBottomRightMask: @@ -398,9 +400,6 @@ def test_contexted_kv_attention( assert (large_tile_size >= B_P_SIZE ), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}" - def ceil_div(a, b): - return (a + b - 1) // b - def pad_to_multiple(a, b): return ceil_div(a, b) * b diff --git a/vllm/attention/ops/nki_flash_attn.py b/vllm/attention/ops/nki_flash_attn.py index e28ff7e8b4ed..95257458c7ec 100644 --- a/vllm/attention/ops/nki_flash_attn.py +++ b/vllm/attention/ops/nki_flash_attn.py @@ -8,9 +8,7 @@ from neuronxcc import nki from neuronxcc.nki.language import par_dim - -def ceil_div(a, b): - return (a + b - 1) // b +from vllm.model_executor.layers.quantization.utils.fp8_utils import ceil_div def is_power_of_2(x): diff --git a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py index f9451ca2fde4..ce1e1de4ff40 100644 --- a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py +++ b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py @@ -5,14 +5,11 @@ import torch from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.utils.fp8_utils import ceil_div from vllm.triton_utils import tl, triton from vllm.utils import round_up -def ceil_div(a, b): - return (a + b - 1) // b - - @triton.jit def moe_align_block_size_stage1( topk_ids_ptr, diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 754650ebeffb..523ad22de7a2 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -31,6 +31,10 @@ def is_fp8(x: Union[torch.dtype, torch.Tensor]) -> bool: return x == torch.float8_e4m3fn or x == torch.float8_e4m3fnuz +def ceil_div(x: 
int, y: int) -> int: + return (x + y - 1) // y + + def cutlass_scaled_mm( A: torch.Tensor, B: torch.Tensor, @@ -158,9 +162,6 @@ def apply_w8a8_block_fp8_linear( if current_platform.is_cuda(): if current_platform.has_device_capability(100): - def ceil_div(x: int, y: int) -> int: - return (x + y - 1) // y - use_cutlass = cutlass_block_fp8_supported and ( ceil_div(weight.shape[0], 128) == weight_scale.shape[0] and ceil_div(weight.shape[1], 128) == weight_scale.shape[1]) From 1adcde5a5e4c48e5038564aa0acc5a306c3f3354 Mon Sep 17 00:00:00 2001 From: yewentao256 Date: Tue, 24 Jun 2025 15:15:49 +0000 Subject: [PATCH 2/2] using cdiv Signed-off-by: yewentao256 --- benchmarks/cutlass_benchmarks/w8a8_benchmarks.py | 9 +++------ tests/kernels/attention/test_mla_decode_cpu.py | 5 +---- .../attention/test_triton_decode_attention.py | 5 +---- tests/neuron/1_core/test_prefix_prefill.py | 6 +++--- vllm/attention/ops/nki_flash_attn.py | 13 ++++++------- .../layers/fused_moe/moe_align_block_size.py | 5 ++--- .../layers/quantization/utils/fp8_utils.py | 10 +++------- 7 files changed, 19 insertions(+), 34 deletions(-) diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index ed67e5b50e29..a5a5b52f6039 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -17,10 +17,9 @@ from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - ceil_div, w8a8_block_fp8_matmul, ) -from vllm.utils import FlexibleArgumentParser +from vllm.utils import FlexibleArgumentParser, cdiv DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] @@ -118,11 +117,9 @@ def bench_fp8( scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) - block_scale_a = torch.rand( - (m, ceil_div(k, 128)), device="cuda", dtype=torch.float32 - ) + block_scale_a = torch.rand((m, cdiv(k, 128)), device="cuda", dtype=torch.float32) block_scale_b = torch.rand( - ceil_div(k, 128), ceil_div(n, 128), device="cuda", dtype=torch.float32 + cdiv(k, 128), cdiv(n, 128), device="cuda", dtype=torch.float32 ) block_scale_a_M_major = block_scale_a.t().contiguous().t() block_scale_b_K_major = block_scale_b.t().contiguous().t() diff --git a/tests/kernels/attention/test_mla_decode_cpu.py b/tests/kernels/attention/test_mla_decode_cpu.py index 5a7480a6beae..f8b307c595de 100644 --- a/tests/kernels/attention/test_mla_decode_cpu.py +++ b/tests/kernels/attention/test_mla_decode_cpu.py @@ -7,10 +7,7 @@ import vllm._custom_ops as ops from vllm.platforms import current_platform - - -def cdiv(a, b): - return (a + b - 1) // b +from vllm.utils import cdiv def ref_mla( diff --git a/tests/kernels/attention/test_triton_decode_attention.py b/tests/kernels/attention/test_triton_decode_attention.py index 358b374ea75b..2dca720fe330 100644 --- a/tests/kernels/attention/test_triton_decode_attention.py +++ b/tests/kernels/attention/test_triton_decode_attention.py @@ -5,10 +5,7 @@ import torch from vllm.attention.ops.triton_decode_attention import decode_attention_fwd - - -def cdiv(a, b): - return (a + b - 1) // b +from vllm.utils import cdiv @pytest.mark.parametrize("B", [3, 5]) diff --git a/tests/neuron/1_core/test_prefix_prefill.py b/tests/neuron/1_core/test_prefix_prefill.py index e07215935711..abf7febc2955 100644 --- a/tests/neuron/1_core/test_prefix_prefill.py +++ b/tests/neuron/1_core/test_prefix_prefill.py @@ 
-7,7 +7,7 @@ import torch import torch.nn.functional as F -from vllm.model_executor.layers.quantization.utils.fp8_utils import ceil_div +from vllm.utils import cdiv class BlockDiagonalCausalFromBottomRightMask: @@ -401,7 +401,7 @@ def test_contexted_kv_attention( ), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}" def pad_to_multiple(a, b): - return ceil_div(a, b) * b + return cdiv(a, b) * b def pad_to_next_power_of_2(a): assert a > 0 @@ -410,7 +410,7 @@ def pad_to_next_power_of_2(a): # calculate input shapes max_num_queries = pad_to_next_power_of_2(sum(query_lens)) context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens) - num_active_blocks = ceil_div(context_lens, block_size).sum().item() + num_active_blocks = cdiv(context_lens, block_size).sum().item() num_active_blocks = pad_to_multiple(num_active_blocks, large_tile_size // block_size) context_kv_len = num_active_blocks * block_size diff --git a/vllm/attention/ops/nki_flash_attn.py b/vllm/attention/ops/nki_flash_attn.py index 95257458c7ec..29fa43201761 100644 --- a/vllm/attention/ops/nki_flash_attn.py +++ b/vllm/attention/ops/nki_flash_attn.py @@ -8,7 +8,7 @@ from neuronxcc import nki from neuronxcc.nki.language import par_dim -from vllm.model_executor.layers.quantization.utils.fp8_utils import ceil_div +from vllm.utils import cdiv def is_power_of_2(x): @@ -33,11 +33,10 @@ def load_block_tables(block_tables_hbm, num_tiles, num_blocks_per_tile): (num_tiles, num_blocks_per_tile)) block_tables_sbuf = nl.zeros( - (ceil_div(num_tiles, - B_P_SIZE), par_dim(B_P_SIZE), num_blocks_per_tile), + (cdiv(num_tiles, B_P_SIZE), par_dim(B_P_SIZE), num_blocks_per_tile), dtype=nl.int32, ) - for i in nl.affine_range(ceil_div(num_tiles, B_P_SIZE)): + for i in nl.affine_range(cdiv(num_tiles, B_P_SIZE)): i_p = nl.arange(B_P_SIZE)[:, None] i_f = nl.arange(num_blocks_per_tile)[None, :] block_tables_sbuf[i, i_p, i_f] = nl.load( @@ -81,7 +80,7 @@ def transform_block_tables_for_indirect_load( assert is_power_of_2( num_blocks_per_tile), f"{num_blocks_per_tile=} is not power of 2" - num_loads = ceil_div(num_blocks_per_tile, B_P_SIZE) + num_loads = cdiv(num_blocks_per_tile, B_P_SIZE) block_tables_transposed = nl.ndarray( ( num_loads, @@ -163,7 +162,7 @@ def load_kv_tile_from_cache( equivalent to (par_dim(B_P_SIZE), seqlen_kv // B_P_SIZE * B_D_SIZE) """ # load key cache - num_loads = ceil_div(num_blocks_per_large_tile, B_P_SIZE) + num_loads = cdiv(num_blocks_per_large_tile, B_P_SIZE) for load_idx in nl.affine_range(num_loads): i_p = nl.arange(B_P_SIZE)[:, None] i_f = nl.arange(tiled_block_size * B_D_SIZE)[None, :] @@ -603,7 +602,7 @@ def flash_paged_attention( ) for large_k_tile_idx in nl.sequential_range(0, num_large_k_tile): - num_loads = ceil_div(num_blocks_per_large_tile, B_P_SIZE) + num_loads = cdiv(num_blocks_per_large_tile, B_P_SIZE) cur_k_tile = nl.ndarray( (par_dim(B_D_SIZE), LARGE_TILE_SZ), dtype=kernel_dtype, diff --git a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py index ce1e1de4ff40..ceb96add0fde 100644 --- a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py +++ b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py @@ -5,9 +5,8 @@ import torch from vllm import _custom_ops as ops -from vllm.model_executor.layers.quantization.utils.fp8_utils import ceil_div from vllm.triton_utils import tl, triton -from vllm.utils import round_up +from vllm.utils import cdiv, round_up @triton.jit @@ -112,7 +111,7 @@ def moe_align_block_size_triton( cumsum = 
torch.zeros((num_experts + 1, ), dtype=torch.int32, device=topk_ids.device) - tokens_per_thread = ceil_div(numel, num_experts) + tokens_per_thread = cdiv(numel, num_experts) moe_align_block_size_stage1[grid]( topk_ids, diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 523ad22de7a2..3a0fb83d627a 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -19,7 +19,7 @@ CUTLASS_BLOCK_FP8_SUPPORTED) from vllm.platforms import current_platform from vllm.triton_utils import tl, triton -from vllm.utils import direct_register_custom_op +from vllm.utils import cdiv, direct_register_custom_op logger = init_logger(__name__) has_deep_gemm = importlib.util.find_spec("deep_gemm") is not None @@ -31,10 +31,6 @@ def is_fp8(x: Union[torch.dtype, torch.Tensor]) -> bool: return x == torch.float8_e4m3fn or x == torch.float8_e4m3fnuz -def ceil_div(x: int, y: int) -> int: - return (x + y - 1) // y - - def cutlass_scaled_mm( A: torch.Tensor, B: torch.Tensor, @@ -163,8 +159,8 @@ def apply_w8a8_block_fp8_linear( if current_platform.has_device_capability(100): use_cutlass = cutlass_block_fp8_supported and ( - ceil_div(weight.shape[0], 128) == weight_scale.shape[0] - and ceil_div(weight.shape[1], 128) == weight_scale.shape[1]) + cdiv(weight.shape[0], 128) == weight_scale.shape[0] + and cdiv(weight.shape[1], 128) == weight_scale.shape[1]) else: # TODO: update this after switching to public sm90 block scale gemm # as it also supports weight.shape % 128 != 0
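
For reference, both patches converge on the shared ceiling-division helper in vllm.utils. Below is a minimal standalone sketch of that helper and the identity it relies on; it assumes cdiv keeps the same semantics as the removed ceil_div definitions for positive integers, and the exact body in vllm/utils.py may differ.

# Standalone sketch (not part of the patches above): the consolidated helper
# is assumed to behave like the deleted ceil_div definitions, i.e. ceiling
# division for positive integers. The exact implementation in vllm/utils.py
# may differ.
def cdiv(a: int, b: int) -> int:
    """Ceiling division: smallest integer n such that n * b >= a (for b > 0)."""
    return -(a // -b)  # equal to (a + b - 1) // b for positive a and b


# Quick equivalence check against the deleted helper from the diffs above.
def _old_ceil_div(x: int, y: int) -> int:
    return (x + y - 1) // y


for a in range(1, 300):
    for b in (1, 3, 64, 128):
        assert cdiv(a, b) == _old_ceil_div(a, b)

Consolidating on this single helper (PATCH 2) avoids re-deriving the same expression in benchmarks, tests, and kernels, and drops the intermediate step from PATCH 1 in which the Neuron and MoE code paths imported ceil_div from the quantization utilities.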