Commit 7c2bdb8

[Misc] Clean up utils (#27552)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Parent: 9932ed6

12 files changed (+41, −258 lines)
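
Every hunk below applies the same pattern: helpers that used to be re-exported from the vllm.utils package root now live in dedicated submodules such as vllm.utils.argparse_utils and vllm.utils.math_utils. A minimal sketch of how downstream code can straddle the move, assuming only the paths shown in the diffs below (the try/except fallback is illustrative and not part of the commit):

# Sketch: import FlexibleArgumentParser from its new home, falling back to the
# pre-#27552 top-level re-export when running against an older vLLM.
try:
    from vllm.utils.argparse_utils import FlexibleArgumentParser  # new path
except ImportError:
    from vllm.utils import FlexibleArgumentParser  # old path

# FlexibleArgumentParser behaves like argparse.ArgumentParser.
parser = FlexibleArgumentParser(description="example")
parser.add_argument("--model", default=None)
print(parser.parse_args([]).model)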

docs/mkdocs/hooks/generate_argparse.py

Lines changed: 3 additions & 1 deletion
@@ -65,7 +65,9 @@ def auto_mock(module, attr, max_mocks=50):
 CompleteCommand = auto_mock("vllm.entrypoints.cli.openai", "CompleteCommand")
 cli_args = auto_mock("vllm.entrypoints.openai", "cli_args")
 run_batch = auto_mock("vllm.entrypoints.openai", "run_batch")
-FlexibleArgumentParser = auto_mock("vllm.utils", "FlexibleArgumentParser")
+FlexibleArgumentParser = auto_mock(
+    "vllm.utils.argparse_utils", "FlexibleArgumentParser"
+)
 
 
 class MarkdownFormatter(HelpFormatter):

tests/utils.py

Lines changed: 1 addition & 3 deletions
@@ -45,9 +45,7 @@
 from vllm.model_executor.model_loader import get_model_loader
 from vllm.platforms import current_platform
 from vllm.transformers_utils.tokenizer import get_tokenizer
-from vllm.utils import (
-    FlexibleArgumentParser,
-)
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.mem_constants import GB_bytes
 from vllm.utils.network_utils import get_open_port
 from vllm.utils.torch_utils import cuda_device_count_stateless

tests/utils_/test_utils.py renamed to tests/utils_/test_argparse_utils.py

Lines changed: 2 additions & 91 deletions
@@ -4,23 +4,15 @@
 
 import json
 import os
-import tempfile
-from pathlib import Path
-from unittest.mock import patch
 
 import pytest
-import torch
 import yaml
 from transformers import AutoTokenizer
 
-from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.transformers_utils.detokenizer_utils import convert_ids_list_to_tokens
 
-from vllm.utils import (
-    FlexibleArgumentParser,
-    bind_kv_cache,
-)
-from ..utils import create_new_process_for_each_test, flat_product
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from ..utils import flat_product
 
 
 # Tests for FlexibleArgumentParser
@@ -256,87 +248,6 @@ def test_duplicate_dict_args(caplog_vllm, parser):
     assert "-O.mode" in caplog_vllm.text
 
 
-def test_bind_kv_cache():
-    from vllm.attention import Attention
-
-    ctx = {
-        "layers.0.self_attn": Attention(32, 128, 0.1),
-        "layers.1.self_attn": Attention(32, 128, 0.1),
-        "layers.2.self_attn": Attention(32, 128, 0.1),
-        "layers.3.self_attn": Attention(32, 128, 0.1),
-    }
-    kv_cache = [
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-    ]
-    bind_kv_cache(ctx, [kv_cache])
-    assert ctx["layers.0.self_attn"].kv_cache[0] is kv_cache[0]
-    assert ctx["layers.1.self_attn"].kv_cache[0] is kv_cache[1]
-    assert ctx["layers.2.self_attn"].kv_cache[0] is kv_cache[2]
-    assert ctx["layers.3.self_attn"].kv_cache[0] is kv_cache[3]
-
-
-def test_bind_kv_cache_kv_sharing():
-    from vllm.attention import Attention
-
-    ctx = {
-        "layers.0.self_attn": Attention(32, 128, 0.1),
-        "layers.1.self_attn": Attention(32, 128, 0.1),
-        "layers.2.self_attn": Attention(32, 128, 0.1),
-        "layers.3.self_attn": Attention(32, 128, 0.1),
-    }
-    kv_cache = [
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-    ]
-    shared_kv_cache_layers = {
-        "layers.2.self_attn": "layers.1.self_attn",
-        "layers.3.self_attn": "layers.0.self_attn",
-    }
-    bind_kv_cache(ctx, [kv_cache], shared_kv_cache_layers)
-    assert ctx["layers.0.self_attn"].kv_cache[0] is kv_cache[0]
-    assert ctx["layers.1.self_attn"].kv_cache[0] is kv_cache[1]
-    assert ctx["layers.2.self_attn"].kv_cache[0] is kv_cache[1]
-    assert ctx["layers.3.self_attn"].kv_cache[0] is kv_cache[0]
-
-
-def test_bind_kv_cache_non_attention():
-    from vllm.attention import Attention
-
-    # example from Jamba PP=2
-    ctx = {
-        "model.layers.20.attn": Attention(32, 128, 0.1),
-        "model.layers.28.attn": Attention(32, 128, 0.1),
-    }
-    kv_cache = [
-        torch.zeros((1,)),
-        torch.zeros((1,)),
-    ]
-    bind_kv_cache(ctx, [kv_cache])
-    assert ctx["model.layers.20.attn"].kv_cache[0] is kv_cache[0]
-    assert ctx["model.layers.28.attn"].kv_cache[0] is kv_cache[1]
-
-
-def test_bind_kv_cache_pp():
-    with patch("vllm.utils.torch_utils.cuda_device_count_stateless", lambda: 2):
-        # this test runs with 1 GPU, but we simulate 2 GPUs
-        cfg = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=2))
-    with set_current_vllm_config(cfg):
-        from vllm.attention import Attention
-
-        ctx = {
-            "layers.0.self_attn": Attention(32, 128, 0.1),
-        }
-        kv_cache = [[torch.zeros((1,))], [torch.zeros((1,))]]
-        bind_kv_cache(ctx, kv_cache)
-        assert ctx["layers.0.self_attn"].kv_cache[0] is kv_cache[0][0]
-        assert ctx["layers.0.self_attn"].kv_cache[1] is kv_cache[1][0]
-
-
 def test_model_specification(
     parser_with_config, cli_config_file, cli_config_file_with_model
 ):

tests/utils_/test_serial_utils.py

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@
 
 @pytest.mark.parametrize("endianness", ENDIANNESS)
 @pytest.mark.parametrize("embed_dtype", EMBED_DTYPE_TO_TORCH_DTYPE.keys())
-@torch.inference_mode
+@torch.inference_mode()
 def test_encode_and_decode(embed_dtype: str, endianness: str):
     for i in range(10):
         tensor = torch.rand(2, 3, 5, 7, 11, 13, device="cpu", dtype=torch.float32)
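
The decorator change above replaces the bare class with an instantiated context manager; @torch.inference_mode() is the form shown in the PyTorch docs, while relying on the bare class may behave differently across PyTorch versions. A small self-contained sketch of the fixed form (the function here is illustrative, not the repo's test):

import torch

@torch.inference_mode()  # instantiate the context manager before using it as a decorator
def double(x: torch.Tensor) -> torch.Tensor:
    return x * 2

out = double(torch.rand(2, 3))
assert out.is_inference()  # tensors created under inference mode are inference tensors
print(out.shape)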

vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@
 )
 from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
 from vllm.logger import init_logger
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionMetadata
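
cdiv moves from the vllm.utils root to vllm.utils.math_utils here (and in the LMCache adapter below). A hedged sketch of its role, assuming cdiv(a, b) is integer ceiling division, i.e. how many size-b blocks are needed to cover a items:

from vllm.utils.math_utils import cdiv  # new path per this commit

num_tokens, block_size = 1000, 16
num_blocks = cdiv(num_tokens, block_size)
assert num_blocks == 63 == -(num_tokens // -block_size)  # 62 * 16 = 992 < 1000 <= 63 * 16
print(num_blocks)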

vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py

Lines changed: 2 additions & 1 deletion
@@ -44,7 +44,8 @@
 )
 from vllm.distributed.parallel_state import get_tensor_model_parallel_rank, get_tp_group
 from vllm.sampling_params import SamplingParams
-from vllm.utils import cdiv, get_kv_cache_torch_dtype
+from vllm.utils import get_kv_cache_torch_dtype
+from vllm.utils.math_utils import cdiv
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.version import __version__ as VLLM_VERSION

vllm/entrypoints/anthropic/api_server.py

Lines changed: 2 additions & 1 deletion
@@ -51,7 +51,8 @@
     with_cancellation,
 )
 from vllm.logger import init_logger
-from vllm.utils import FlexibleArgumentParser, set_ulimit
+from vllm.utils import set_ulimit
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.network_utils import is_valid_ipv6_address
 from vllm.version import __version__ as VLLM_VERSION

vllm/entrypoints/cli/serve.py

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@
 from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
 from vllm.logger import init_logger
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.network_utils import get_tcp_uri
 from vllm.utils.system_utils import decorate_logs, set_process_title
 from vllm.v1.engine.core import EngineCoreProc

vllm/entrypoints/openai/api_server.py

Lines changed: 2 additions & 1 deletion
@@ -108,7 +108,8 @@
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParserManager
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import Device, FlexibleArgumentParser, set_ulimit
+from vllm.utils import Device, set_ulimit
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.network_utils import is_valid_ipv6_address
 from vllm.utils.system_utils import decorate_logs
 from vllm.v1.engine.exceptions import EngineDeadError

vllm/lora/punica_wrapper/punica_gpu.py

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@
 
 from vllm.lora.layers import LoRAMapping
 from vllm.triton_utils import HAS_TRITON, triton
-from vllm.utils import round_up
+from vllm.utils.math_utils import round_up
 
 if HAS_TRITON:
     from vllm.lora.ops.triton_ops import (
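
round_up likewise moves to vllm.utils.math_utils. A hedged sketch, assuming round_up(x, y) pads x up to the next multiple of y (here typically an alignment to a Triton tile size):

from vllm.utils.math_utils import round_up  # new path per this commit

assert round_up(1000, 16) == 1008  # next multiple of 16 at or above 1000
assert round_up(1024, 16) == 1024  # already aligned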
