From db35f17328a25b2fb79e9d006ce9cac47d7762b5 Mon Sep 17 00:00:00 2001
From: rzou
Date: Fri, 11 Apr 2025 07:50:26 -0700
Subject: [PATCH] Fix vLLM x torch.compile config caching

Fixes https://github.com/vllm-project/vllm/issues/16150

Based on the ModelConfig, we decide if we can reuse an existing
torch.compile'd artifact or if we need to recompile. Unfortunately we
were not checking enough flags on the config. The problem in #16150 was
specifically that a change to override_generation_config requires a
recompile, but we were not detecting it.

I went through ModelConfig and added more fields to the check that
decides whether a model needs to recompile. Disclaimer: I do not know
what a lot of these fields do, but I figure it is better to include too
many than too few (we risk silent incorrectness if the caching is
wrong). We can remove fields later if we end up recompiling too often.

This is also one of the reasons the PyTorch team recommends that vLLM
use torch.compile's built-in caching (once we improve it):
torch.compile decides programmatically what needs to be cached, and we
test that logic really well.

Test Plan:
- tested locally

Signed-off-by: rzou
---
 vllm/config.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index d24082799d00..4ed481506cdd 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -294,12 +294,18 @@ def compute_hash(self) -> str:
         factors.append(self.quantization)
         factors.append(self.revision)
         factors.append(self.code_revision)
+        factors.append(self.max_model_len)
+        factors.append(self.max_logprobs)
+        factors.append(self.disable_sliding_window)
         factors.append(self.trust_remote_code)
+        factors.append(self.mm_processor_kwargs)
+        factors.append(self.generation_config)
+        factors.append(self.model_impl)
+        factors.append(self.override_generation_config)
         factors.append(self.rope_scaling)
         factors.append(self.rope_theta)
-        # rope cos/sin cache depends on the max_position_embeddings
-        factors.append(
-            getattr(self.hf_config, "max_position_embeddings", "None"))
+        # hf_config can control how the model looks!
+        factors.append(self.hf_config.to_json_string())
         return hashlib.sha256(str(factors).encode()).hexdigest()

     def __init__(
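
Note for reviewers: as a rough illustration of the mechanism this patch
touches, here is a minimal, self-contained Python sketch (not vLLM code;
the factor values and the cache-metadata handling are hypothetical) of how
a digest over config factors can gate reuse of a compiled artifact:

    import hashlib

    def config_digest(factors: list) -> str:
        # Same idea as ModelConfig.compute_hash(): every factor that can
        # change the compiled model's behavior must be folded into the digest.
        return hashlib.sha256(str(factors).encode()).hexdigest()

    # Hypothetical reuse decision driven by the digest.
    factors = ["quantization=None", "max_model_len=4096",
               "override_generation_config={'temperature': 0.7}"]
    current = config_digest(factors)
    previous = None  # e.g. read from metadata stored next to a cached artifact
    if current == previous:
        print("digest matches -> reuse the existing torch.compile'd artifact")
    else:
        print("a hashed factor changed -> recompile")

If a factor is left out of the digest (as override_generation_config was),
two different configs can collide on the same digest and silently reuse the
wrong artifact, which is the failure mode in #16150.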