diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 0d58d4d2218f..13a3756fdacb 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -488,9 +488,9 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig,
     if needed_memory > available_memory:
         raise ValueError(
             f"To serve at least one request with the models's max seq len "
-            f"({max_model_len}), ({needed_memory/1024/1024/1024:.2f} GB KV "
+            f"({max_model_len}), ({needed_memory/1024/1024/1024:.2f} GiB KV "
             f"cache is needed, which is larger than the available KV cache "
-            f"memory ({available_memory/1024/1024/1024:.2f} GB). Try "
+            f"memory ({available_memory/1024/1024/1024:.2f} GiB). Try "
             f"increasing `gpu_memory_utilization` or decreasing "
             f"`max_model_len` when initializing the engine.")
 
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index e3df2a62e67f..209c6ef50d17 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -24,8 +24,8 @@
 from vllm.sampling_params import SamplingType
 from vllm.sequence import IntermediateTensors
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
-                        LayerBlockType, LazyLoader, cdiv, check_use_alibi,
-                        is_pin_memory_available)
+                        GiB_bytes, LayerBlockType, LazyLoader, cdiv,
+                        check_use_alibi, is_pin_memory_available)
 from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
 from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
@@ -1195,8 +1195,8 @@ def load_model(self) -> None:
                                                   self.device)
         time_after_load = time.perf_counter()
         self.model_memory_usage = m.consumed_memory
-        logger.info("Model loading took %.4f GB and %.6f seconds",
-                    self.model_memory_usage / float(2**30),
+        logger.info("Model loading took %.4f GiB and %.6f seconds",
+                    self.model_memory_usage / GiB_bytes,
                     time_after_load - time_before_load)
 
     def _get_prompt_logprobs_dict(
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index edbafb48c938..86e6d9752013 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1143,8 +1143,8 @@ def load_model(self) -> None:
             time_after_load = time.perf_counter()
 
         self.model_memory_usage = m.consumed_memory
-        logger.info("Model loading took %.4f GB and %.6f seconds",
-                    self.model_memory_usage / float(2**30),
+        logger.info("Model loading took %.4f GiB and %.6f seconds",
+                    self.model_memory_usage / GiB_bytes,
                     time_after_load - time_before_load)
         if self.prompt_adapter_config:
             self.prompt_adapter_manager = LRUCacheWorkerPromptAdapterManager(
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index 39957e661c47..9d49b4385dca 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -25,7 +25,7 @@
                              MultiModalRegistry)
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
-from vllm.utils import DeviceMemoryProfiler, make_tensor_with_pad
+from vllm.utils import DeviceMemoryProfiler, GiB_bytes, make_tensor_with_pad
 from vllm.worker.model_runner import AttentionMetadata, SamplingMetadata
 from vllm.worker.model_runner_base import (
     ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase,
@@ -422,8 +422,8 @@ def load_model(self) -> None:
             self.model = get_model(vllm_config=self.vllm_config)
 
         self.model_memory_usage = m.consumed_memory
-        logger.info("Loading model weights took %.4f GB",
-                    self.model_memory_usage / float(2**30))
+        logger.info("Loading model weights took %.4f GiB",
+                    self.model_memory_usage / GiB_bytes)
 
     def get_model(self) -> nn.Module:
         return self.model
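
For context, below is a minimal, self-contained sketch of the conversion this patch standardizes on. The patch only imports GiB_bytes from vllm.utils without showing its definition, so the constant is re-declared here under the assumption that it equals 1 << 30 (a binary gibibyte, as opposed to the decimal gigabyte, 10**9 bytes); the helper report_model_memory and the standalone logger are hypothetical and exist only for illustration:

# sketch.py -- illustrative only, not part of the patch above
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("vllm.sketch")

# Assumption: vllm.utils defines GiB_bytes = 1 << 30; the patch swaps the
# ad-hoc float(2**30) divisor for this named constant.
GiB_bytes = 1 << 30

def report_model_memory(consumed_bytes: int, load_seconds: float) -> None:
    # Mirrors the patched log line: bytes are divided by 2**30 (GiB), not 10**9 (GB).
    logger.info("Model loading took %.4f GiB and %.6f seconds",
                consumed_bytes / GiB_bytes,
                load_seconds)

report_model_memory(consumed_bytes=14 * (1 << 30), load_seconds=12.5)
# -> INFO:vllm.sketch:Model loading took 14.0000 GiB and 12.500000 seconds

The value being divided is a raw byte count, so labeling the result "GiB" matches the 2**30 divisor; the old "GB" label understated the unit by about 7%.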