Skip to content

Commit 39e24e5

Browse files
iAmir97 and gemini-code-assist[bot]
authored and committed
[Chore] Separate out vllm.utils.mem_utils (vllm-project#27143)
Signed-off-by: iAmir97 <Amir.balwel@embeddedllm.com> Signed-off-by: iAmir97 <71513472+iAmir97@users.noreply.github.com> Co-authored-by: iAmir97 <Amir.balwel@embeddedllm.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Signed-off-by: Alberto Perdomo <aperdomo@redhat.com>
1 parent ac560ec commit 39e24e5

File tree

19 files changed

+268
-252
lines changed

19 files changed

+268
-252
lines changed

tests/basic_correctness/test_cumem.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
from vllm import LLM, SamplingParams
88
from vllm.device_allocator.cumem import CuMemAllocator
9-
from vllm.utils import GiB_bytes
9+
from vllm.utils.mem_constants import GiB_bytes
1010

1111
from ..utils import create_new_process_for_each_test
1212

tests/kernels/attention/test_attention.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from vllm import _custom_ops as ops
1212
from vllm.attention.layer import Attention, MultiHeadAttention
1313
from vllm.platforms import current_platform
14-
from vllm.utils import get_max_shared_memory_bytes
14+
from vllm.utils.mem_utils import get_max_shared_memory_bytes
1515

1616
if not current_platform.is_rocm():
1717
from xformers import ops as xops

tests/models/test_initialization.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import pytest
88

99
from vllm import LLM
10-
from vllm.utils import GiB_bytes
10+
from vllm.utils.mem_constants import GiB_bytes
1111
from vllm.v1.core.kv_cache_utils import (
1212
generate_scheduler_kv_cache_config,
1313
get_kv_cache_configs,

tests/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,10 @@
4646
from vllm.transformers_utils.tokenizer import get_tokenizer
4747
from vllm.utils import (
4848
FlexibleArgumentParser,
49-
GB_bytes,
5049
cuda_device_count_stateless,
5150
get_open_port,
5251
)
52+
from vllm.utils.mem_constants import GB_bytes
5353

5454
if current_platform.is_rocm():
5555
from amdsmi import (

tests/utils_/test_utils.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323

2424
from vllm.utils import (
2525
FlexibleArgumentParser,
26-
MemorySnapshot,
2726
bind_kv_cache,
2827
common_broadcastable_dtype,
2928
current_stream,
@@ -33,13 +32,13 @@
3332
join_host_port,
3433
make_zmq_path,
3534
make_zmq_socket,
36-
memory_profiling,
3735
sha256,
3836
split_host_port,
3937
split_zmq_path,
4038
unique_filepath,
4139
)
4240

41+
from vllm.utils.mem_utils import MemorySnapshot, memory_profiling
4342
from ..utils import create_new_process_for_each_test, flat_product
4443

4544

tests/v1/core/test_kv_cache_utils.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@
1414
PlaceholderRange,
1515
)
1616
from vllm.sampling_params import SamplingParams
17-
from vllm.utils import GiB_bytes, sha256, sha256_cbor
17+
from vllm.utils import sha256, sha256_cbor
18+
from vllm.utils.mem_constants import GiB_bytes
1819
from vllm.v1.core.kv_cache_manager import KVCacheManager
1920
from vllm.v1.core.kv_cache_utils import (
2021
BlockHash,

tests/v1/tpu/worker/test_tpu_model_runner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
)
1414
from vllm.pooling_params import PoolingParams
1515
from vllm.sampling_params import SamplingParams
16-
from vllm.utils import GiB_bytes
16+
from vllm.utils.mem_constants import GiB_bytes
1717
from vllm.v1.core.kv_cache_utils import estimate_max_model_len, get_kv_cache_configs
1818
from vllm.v1.core.sched.output import CachedRequestData, NewRequestData, SchedulerOutput
1919
from vllm.v1.worker.tpu_model_runner import (

tests/v1/worker/test_gpu_model_runner.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@
2121
from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2
2222
from vllm.platforms import current_platform
2323
from vllm.sampling_params import SamplingParams
24-
from vllm.utils import GiB_bytes, update_environment_variables
24+
from vllm.utils import update_environment_variables
25+
from vllm.utils.mem_constants import GiB_bytes
2526
from vllm.v1.core.kv_cache_utils import estimate_max_model_len, get_kv_cache_configs
2627
from vllm.v1.core.sched.output import CachedRequestData, NewRequestData, SchedulerOutput
2728
from vllm.v1.kv_cache_interface import (

tests/v1/worker/test_worker_memory_snapshot.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import torch
1212

1313
from vllm.engine.arg_utils import EngineArgs
14-
from vllm.utils import MemorySnapshot
14+
from vllm.utils.mem_utils import MemorySnapshot
1515
from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment
1616

1717
# Global queue to track operation order across processes

vllm/config/cache.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010

1111
from vllm.config.utils import config
1212
from vllm.logger import init_logger
13-
from vllm.utils import GiB_bytes, get_cpu_memory
13+
from vllm.utils.mem_constants import GiB_bytes
14+
from vllm.utils.mem_utils import get_cpu_memory
1415

1516
if TYPE_CHECKING:
1617
from vllm.config.parallel import ParallelConfig

0 commit comments

Comments (0)