diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 6855526583f0..2ef6db016987 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -383,13 +383,15 @@ def compile_or_warm_up_model(self) -> None:
                 f"for non-torch memory, and {GiB(cuda_graph_memory_bytes)} "
                 f"GiB for CUDAGraph memory. Replace gpu_memory_utilization "
                 f"config with `--kv-cache-memory="
-                f"{kv_cache_memory_bytes_to_requested_limit}` to fit into "
-                f"requested memory, or `--kv-cache-memory="
-                f"{kv_cache_memory_bytes_to_gpu_limit}` to fully "
+                f"{kv_cache_memory_bytes_to_requested_limit}` "
+                f"({GiB(kv_cache_memory_bytes_to_requested_limit)} GiB) to fit "
+                f"into requested memory, or `--kv-cache-memory="
+                f"{kv_cache_memory_bytes_to_gpu_limit}` "
+                f"({GiB(kv_cache_memory_bytes_to_gpu_limit)} GiB) to fully "
                 f"utilize gpu memory. Current kv cache memory in use is "
-                f"{int(self.available_kv_cache_memory_bytes)} bytes.")
+                f"{GiB(self.available_kv_cache_memory_bytes)} GiB.")
-            logger.info(msg)
+            logger.debug(msg)
 
         # Warm up sampler and preallocate memory buffer for logits and other
         # sampling related tensors of max possible shape to avoid memory
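
For context, the patch keeps the raw byte count in the suggested `--kv-cache-memory` values (so it can be copy-pasted into the flag) while appending a human-readable GiB rendering, and it demotes the message from `logger.info` to `logger.debug`. A minimal sketch of the conversion follows; `GiB_bytes` and the rounding inside `GiB` are assumptions for illustration, since the diff only shows the call sites:

```python
# Sketch of a bytes-to-GiB helper as used in the message above.
# The name `GiB` matches the call sites in the diff; the constant and
# rounding behavior are assumed, not taken from gpu_worker.py itself.
GiB_bytes = 1 << 30  # bytes per gibibyte


def GiB(b: float) -> float:
    """Convert a byte count to GiB, rounded for log readability."""
    return round(b / GiB_bytes, 2)


# Example: the suggested flag value is printed both raw (machine-usable)
# and in GiB (human-readable), mirroring the new message format.
kv_cache_memory_bytes_to_requested_limit = 12_884_901_888  # 12 * 2**30
print(f"`--kv-cache-memory={kv_cache_memory_bytes_to_requested_limit}` "
      f"({GiB(kv_cache_memory_bytes_to_requested_limit)} GiB)")
# -> `--kv-cache-memory=12884901888` (12.0 GiB)
```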