Skip to content

Commit e34ddcb

Browse files
committed
[main][bugfix] disable the chunked prefill feature in Non-MLA models
Signed-off-by: rjg-lyh <1318825571@qq.com>
1 parent 40c2c05 commit e34ddcb

File tree

2 files changed

+21
-0
lines changed

2 files changed

+21
-0
lines changed

tests/conftest.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
# we do not explicitly patch here; some of them might be ineffective
4040
# in pytest scenario
4141
from vllm_ascend.utils import adapt_patch # noqa E402
42+
from vllm_ascend.ascend_config import clear_ascend_config
4243

4344
adapt_patch(True)
4445

@@ -348,6 +349,7 @@ def __enter__(self):
348349

349350
def __exit__(self, exc_type, exc_value, traceback):
350351
del self.model
352+
clear_ascend_config()
351353
cleanup_dist_env_and_memory()
352354

353355

vllm_ascend/platform.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,25 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
127127
model_config = vllm_config.model_config
128128
parallel_config = vllm_config.parallel_config
129129
cache_config = vllm_config.cache_config
130+
scheduler_config = vllm_config.scheduler_config
131+
ascend_scheduler_config = ascend_config.ascend_scheduler_config
132+
133+
if not model_config.use_mla:
134+
logger.info(
135+
"Non-MLA LLMs forcibly disable the chunked prefill feature, "
136+
"as the performance of operators supporting this feature "
137+
"is currently suboptimal.")
138+
scheduler_config.enable_chunked_prefill = False
139+
scheduler_config.chunked_prefill_enabled = False
140+
if envs.VLLM_USE_V1 and \
141+
not model_config.is_multimodal_model and \
142+
not scheduler_config.delay_factor > 0 and \
143+
not scheduler_config.send_delta_data and \
144+
scheduler_config.policy == "fcfs" and \
145+
scheduler_config.num_scheduler_steps == 1:
146+
ascend_scheduler_config.enabled = True
147+
if hasattr(ascend_scheduler_config, "enable_chunked_prefill"):
148+
ascend_scheduler_config.enable_chunked_prefill = False
130149

131150
if parallel_config:
132151
if parallel_config.enable_expert_parallel:

0 commit comments

Comments
 (0)