From db35f17328a25b2fb79e9d006ce9cac47d7762b5 Mon Sep 17 00:00:00 2001
From: rzou
Date: Fri, 11 Apr 2025 07:50:26 -0700
Subject: [PATCH] Fix vLLM x torch.compile config caching

Fixes https://github.com/vllm-project/vllm/issues/16150

Based on the ModelConfig, we decide if we can reuse an existing
torch.compile'd artifact or if we need to recompile. Unfortunately we
were not checking enough flags on the config. The problem in #16150 was
specifically that a change to override_generation_config requires a
recompile, but we were not detecting it.

I went through ModelConfig and added more fields to the check that
decides whether a model needs to recompile. Disclaimer: I do not know
what a lot of these fields do, but I figure it is better to include too
many than too few (we risk silent incorrectness if the caching is
wrong). We can remove fields later if we end up recompiling too often.

This is also one of the reasons the PyTorch team recommends that vLLM
use torch.compile's built-in caching (once we improve it):
torch.compile decides programmatically what needs to be cached, and we
test that logic really well.

Test Plan:
- tested locally

Signed-off-by: rzou
---
 vllm/config.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index d24082799d00..4ed481506cdd 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -294,12 +294,18 @@ def compute_hash(self) -> str:
         factors.append(self.quantization)
         factors.append(self.revision)
         factors.append(self.code_revision)
+        factors.append(self.max_model_len)
+        factors.append(self.max_logprobs)
+        factors.append(self.disable_sliding_window)
         factors.append(self.trust_remote_code)
+        factors.append(self.mm_processor_kwargs)
+        factors.append(self.generation_config)
+        factors.append(self.model_impl)
+        factors.append(self.override_generation_config)
         factors.append(self.rope_scaling)
         factors.append(self.rope_theta)
-        # rope cos/sin cache depends on the max_position_embeddings
-        factors.append(
-            getattr(self.hf_config, "max_position_embeddings", "None"))
+        # hf_config can control how the model looks!
+        factors.append(self.hf_config.to_json_string())
         return hashlib.sha256(str(factors).encode()).hexdigest()

     def __init__(
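
Note for reviewers: as a rough illustration of the mechanism this patch
touches, here is a minimal, self-contained Python sketch (not vLLM code;
the factor values and the cache-metadata handling are hypothetical) of how
a digest over config factors can gate reuse of a compiled artifact:

    import hashlib

    def config_digest(factors: list) -> str:
        # Same idea as ModelConfig.compute_hash(): every factor that can
        # change the compiled model's behavior must be folded into the digest.
        return hashlib.sha256(str(factors).encode()).hexdigest()

    # Hypothetical reuse decision driven by the digest.
    factors = ["quantization=None", "max_model_len=4096",
               "override_generation_config={'temperature': 0.7}"]
    current = config_digest(factors)
    previous = None  # e.g. read from metadata stored next to a cached artifact
    if current == previous:
        print("digest matches -> reuse the existing torch.compile'd artifact")
    else:
        print("a hashed factor changed -> recompile")

If a factor is left out of the digest (as override_generation_config was),
two different configs can collide on the same digest and silently reuse the
wrong artifact, which is the failure mode in #16150.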