Commit a6fd68c

[main][bugfix] disable the chunked prefill feature in Non-MLA LLMs
Signed-off-by: rjg-lyh <1318825571@qq.com>
1 parent fc2bcbe commit a6fd68c

2 files changed: 29 additions & 5 deletions

vllm_ascend/core/schedule_config.py

Lines changed: 0 additions & 5 deletions
@@ -25,7 +25,6 @@
 class AscendSchedulerConfig(SchedulerConfig):
     enable_chunked_prefill: bool = False
     policy: str = "fcfs"
-    num_scheduler_steps: int = 1
     scheduler_cls: Union[str, Type[object]] = (
         "vllm_ascend.core.scheduler.AscendScheduler")
     enable_pd_transfer: bool = False
@@ -44,7 +43,6 @@ def initialize_from_config(
         # Override default values into original SchedulerConfig
         scheduler_config["enable_chunked_prefill"] = False
         scheduler_config["policy"] = "fcfs"
-        scheduler_config["num_scheduler_steps"] = 1
         scheduler_config["scheduler_cls"] = (
             "vllm_ascend.core.scheduler.AscendScheduler")
         scheduler_config["enable_pd_transfer"] = False
@@ -76,9 +74,6 @@ def __post_init__(self) -> None:
         if self.is_multimodal_model:
             raise NotImplementedError(
                 "currently AscendScheduler only supports LLM models.")
-        if self.num_scheduler_steps > 1:
-            raise NotImplementedError(
-                "currently AscendScheduler doesn't support multi-step.")
         if self.send_delta_data:
             raise NotImplementedError(
                 "currently AscendScheduler doesn't support send_delta_data.")

vllm_ascend/platform.py

Lines changed: 29 additions & 0 deletions
@@ -128,6 +128,35 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         model_config = vllm_config.model_config
         parallel_config = vllm_config.parallel_config
         cache_config = vllm_config.cache_config
+        decoding_config = vllm_config.decoding_config
+        scheduler_config = vllm_config.scheduler_config
+        ascend_scheduler_config = ascend_config.ascend_scheduler_config
+
+        if model_config is not None and not model_config.use_mla:
+            logger.info(
+                "Non-MLA LLMs forcibly disable the chunked prefill feature, "
+                "as the performance of the operators supporting this "
+                "feature is currently suboptimal.")
+            if not model_config.is_multimodal_model and \
+                decoding_config.backend == "auto" and \
+                not scheduler_config.delay_factor > 0 and \
+                not scheduler_config.send_delta_data and \
+                scheduler_config.policy == "fcfs":
+                ascend_scheduler_config.enabled = True
+                chunked_prefill_enabled_in_ascend_scheduler = getattr(
+                    ascend_scheduler_config, "enable_chunked_prefill", False)
+                if chunked_prefill_enabled_in_ascend_scheduler:
+                    logger.warning(
+                        "Chunked prefill feature is enabled in ascend_scheduler, "
+                        "but note that the operator supporting this feature "
+                        "can lead to performance degradation.")
+                # In this situation, max_num_batched_tokens would have been rewritten.
+                # So we must make sure max_num_batched_tokens is not smaller than max_model_len.
+                if (scheduler_config.max_num_batched_tokens
+                        < scheduler_config.max_model_len
+                        and not chunked_prefill_enabled_in_ascend_scheduler):
+                    scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len
+
         kv_cache_dtype = vllm_config.additional_config.get(
             "kv_cache_dtype", None)
         if kv_cache_dtype is not None:
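
To make the new gating logic easier to follow, here is a self-contained sketch of the decision it implements (hypothetical names; SchedCfg and force_ascend_scheduler are illustrative stand-ins, not the real vLLM objects):

from dataclasses import dataclass

@dataclass
class SchedCfg:
    max_num_batched_tokens: int
    max_model_len: int
    delay_factor: float = 0.0
    send_delta_data: bool = False
    policy: str = "fcfs"

def force_ascend_scheduler(sched: SchedCfg, use_mla: bool,
                           is_multimodal: bool, guided_backend: str,
                           chunked_prefill_in_ascend_scheduler: bool) -> bool:
    """Return True when the AscendScheduler should be force-enabled."""
    if use_mla:
        return False  # MLA models keep chunked prefill as configured
    ok = (not is_multimodal and guided_backend == "auto"
          and not sched.delay_factor > 0 and not sched.send_delta_data
          and sched.policy == "fcfs")
    if ok and not chunked_prefill_in_ascend_scheduler \
            and sched.max_num_batched_tokens < sched.max_model_len:
        # Without chunked prefill a whole prompt must fit in one batch,
        # so the token budget is raised to cover the full context length.
        sched.max_num_batched_tokens = sched.max_model_len
    return ok

# Example: a 32k-context non-MLA model with an 8k token budget gets clamped up.
cfg = SchedCfg(max_num_batched_tokens=8192, max_model_len=32768)
assert force_ascend_scheduler(cfg, use_mla=False, is_multimodal=False,
                              guided_backend="auto",
                              chunked_prefill_in_ascend_scheduler=False)
assert cfg.max_num_batched_tokens == 32768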
