|
@@ -4,23 +4,15 @@
 
 import json
 import os
-import tempfile
-from pathlib import Path
-from unittest.mock import patch
 
 import pytest
-import torch
 import yaml
 from transformers import AutoTokenizer
 
-from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.transformers_utils.detokenizer_utils import convert_ids_list_to_tokens
 
-from vllm.utils import (
-    FlexibleArgumentParser,
-    bind_kv_cache,
-)
-from ..utils import create_new_process_for_each_test, flat_product
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from ..utils import flat_product
 
 
 # Tests for FlexibleArgumentParser
@@ -256,87 +248,6 @@ def test_duplicate_dict_args(caplog_vllm, parser): |
     assert "-O.mode" in caplog_vllm.text
 
 
-def test_bind_kv_cache():
-    from vllm.attention import Attention
-
-    ctx = {
-        "layers.0.self_attn": Attention(32, 128, 0.1),
-        "layers.1.self_attn": Attention(32, 128, 0.1),
-        "layers.2.self_attn": Attention(32, 128, 0.1),
-        "layers.3.self_attn": Attention(32, 128, 0.1),
-    }
-    kv_cache = [
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-    ]
-    bind_kv_cache(ctx, [kv_cache])
-    assert ctx["layers.0.self_attn"].kv_cache[0] is kv_cache[0]
-    assert ctx["layers.1.self_attn"].kv_cache[0] is kv_cache[1]
-    assert ctx["layers.2.self_attn"].kv_cache[0] is kv_cache[2]
-    assert ctx["layers.3.self_attn"].kv_cache[0] is kv_cache[3]
-
-
-def test_bind_kv_cache_kv_sharing():
-    from vllm.attention import Attention
-
-    ctx = {
-        "layers.0.self_attn": Attention(32, 128, 0.1),
-        "layers.1.self_attn": Attention(32, 128, 0.1),
-        "layers.2.self_attn": Attention(32, 128, 0.1),
-        "layers.3.self_attn": Attention(32, 128, 0.1),
-    }
-    kv_cache = [
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-    ]
-    shared_kv_cache_layers = {
-        "layers.2.self_attn": "layers.1.self_attn",
-        "layers.3.self_attn": "layers.0.self_attn",
-    }
-    bind_kv_cache(ctx, [kv_cache], shared_kv_cache_layers)
-    assert ctx["layers.0.self_attn"].kv_cache[0] is kv_cache[0]
-    assert ctx["layers.1.self_attn"].kv_cache[0] is kv_cache[1]
-    assert ctx["layers.2.self_attn"].kv_cache[0] is kv_cache[1]
-    assert ctx["layers.3.self_attn"].kv_cache[0] is kv_cache[0]
-
-
-def test_bind_kv_cache_non_attention():
-    from vllm.attention import Attention
-
-    # example from Jamba PP=2
-    ctx = {
-        "model.layers.20.attn": Attention(32, 128, 0.1),
-        "model.layers.28.attn": Attention(32, 128, 0.1),
-    }
-    kv_cache = [
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-    ]
-    bind_kv_cache(ctx, [kv_cache])
-    assert ctx["model.layers.20.attn"].kv_cache[0] is kv_cache[0]
-    assert ctx["model.layers.28.attn"].kv_cache[0] is kv_cache[1]
-
-
-def test_bind_kv_cache_pp():
-    with patch("vllm.utils.torch_utils.cuda_device_count_stateless", lambda: 2):
-        # this test runs with 1 GPU, but we simulate 2 GPUs
-        cfg = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=2))
-        with set_current_vllm_config(cfg):
-            from vllm.attention import Attention
-
-            ctx = {
-                "layers.0.self_attn": Attention(32, 128, 0.1),
-            }
-            kv_cache = [[torch.zeros((1,))], [torch.zeros((1,))]]
-            bind_kv_cache(ctx, kv_cache)
-            assert ctx["layers.0.self_attn"].kv_cache[0] is kv_cache[0][0]
-            assert ctx["layers.0.self_attn"].kv_cache[1] is kv_cache[1][0]
-
-
 def test_model_specification(
     parser_with_config, cli_config_file, cli_config_file_with_model
 ):
|