[main][bugfix] disable the chunked prefill feature in Non-MLA models

rjg-lyh · rjg-lyh · commit 1d6e56864078 · 2025-08-30T20:57:01.000+08:00
Signed-off-by: rjg-lyh <1318825571@qq.com>
diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
@@ -127,6 +127,19 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         model_config = vllm_config.model_config
         parallel_config = vllm_config.parallel_config
         cache_config = vllm_config.cache_config
+        scheduler_config = vllm_config.scheduler_config
+        ascend_scheduler_config = ascend_config.ascend_scheduler_config
+
+        if not model_config.use_mla:
+            logger.info(
+                "Non-MLA models forcibly disable the chunked prefill feature,"
+                "as the performance of operators supporting this feature "
+                "functionality is currently suboptimal.")
+            scheduler_config.enable_chunked_prefill = False
+            scheduler_config.chunked_prefill_enabled = False
+            ascend_scheduler_config.enabled = True
+            if hasattr(ascend_scheduler_config, "enable_chunked_prefill"):
+                ascend_scheduler_config.enable_chunked_prefill = False
 
         if parallel_config:
             if parallel_config.enable_expert_parallel: