Disable automatic enabling of chunked prefill for long contexts on the ROCm platform due to poor performance (#324)

gshtras · web-flow · commit 405e730c4141 · 2024-12-12T15:13:12.000-05:00
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
@@ -1063,7 +1063,8 @@ def create_engine_config(self,
                 if (is_gpu and not use_sliding_window and not use_spec_decode
                         and not self.enable_lora
                         and not self.enable_prompt_adapter
-                        and model_config.task != "embedding"):
+                        and model_config.task != "embedding"
+                        and not current_platform.is_rocm()):
                     self.enable_chunked_prefill = True
                     logger.warning(
                         "Chunked prefill is enabled by default for models with "