Commit 0523844

[main][bugfix] disable the chunked prefill feature in Non-MLA models
Signed-off-by: rjg-lyh <1318825571@qq.com>
1 parent 5926225 commit 0523844

3 files changed: +23 -2 lines changed


docs/source/tutorials/large_scale_ep.md

Lines changed: 2 additions & 2 deletions

@@ -345,7 +345,7 @@ for process in processes:
 
 :::::
 
-Note that the prefiller nodes and the decoder nodes may have differenet configurations. In this example, each prefiller node deployed as master node independently, but all decoder nodes take the first node as the master node. So it leads to differents in 'dp_size_local' and 'dp_rank_start'
+Note that the prefiller nodes and the decoder nodes may have different configurations. In this example, each prefiller node is deployed as a master node independently, but all decoder nodes take the first node as the master node. This leads to differences in 'dp_size_local' and 'dp_rank_start'.
 
 ## Example proxy for Distributed DP Server
 
@@ -395,7 +395,7 @@ python load_balance_proxy_server_example.py \
 
 You can get the proxy program in the repository's examples, [load_balance_proxy_server_example.py](https://github.com/vllm-project/vllm-ascend/blob/v0.9.1-dev/examples/disaggregate_prefill_v1/load_balance_proxy_server_example.py)
 
-## Benckmark
+## Benchmark
 
 We recommend use aisbench tool to assess performance. [aisbench](https://gitee.com/aisbench/benchmark) Execute the following commands to install aisbench
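To make the note about 'dp_size_local' and 'dp_rank_start' concrete, here is a minimal sketch of how the two values diverge between prefiller and decoder nodes. The node counts and per-node DP size below are hypothetical, not taken from the tutorial:

# Hypothetical setup: 2 prefiller nodes and 2 decoder nodes, each
# hosting 4 data-parallel (DP) ranks; the tutorial's sizes may differ.
DP_PER_NODE = 4

# Each prefiller node is its own master, so its local DP group starts at rank 0.
prefillers = [{"node": i, "dp_size_local": DP_PER_NODE, "dp_rank_start": 0}
              for i in range(2)]

# All decoder nodes take the first node as master, so each node's
# starting rank is offset by its position in the shared DP group.
decoders = [{"node": i, "dp_size_local": DP_PER_NODE,
             "dp_rank_start": i * DP_PER_NODE}
            for i in range(2)]

print(prefillers)  # dp_rank_start == 0 on every prefiller node
print(decoders)    # dp_rank_start == 0, 4 across the decoder nodes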

tests/conftest.py

Lines changed: 2 additions & 0 deletions

@@ -39,6 +39,7 @@
 # we not explicitly patch here, some of them might be effectiveless
 # in pytest scenario
 from vllm_ascend.utils import adapt_patch  # noqa E402
+from vllm_ascend.ascend_config import clear_ascend_config
 
 adapt_patch(True)
 
@@ -348,6 +349,7 @@ def __enter__(self):
 
     def __exit__(self, exc_type, exc_value, traceback):
         del self.model
+        clear_ascend_config()
         cleanup_dist_env_and_memory()
 
 
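The two added lines pair up: the import brings in clear_ascend_config, and the __exit__ hook calls it so one test's ascend config cannot leak into the next. Below is a minimal standalone sketch of the same reset-on-exit pattern; the stub names are hypothetical stand-ins, not the real conftest.py classes:

# _CONFIG plays the role of the module-level ascend config singleton.
_CONFIG = {}

def init_config(**kwargs):
    _CONFIG.update(kwargs)

def clear_config():
    # Analogue of vllm_ascend.ascend_config.clear_ascend_config().
    _CONFIG.clear()

class StubRunner:
    def __enter__(self):
        init_config(torchair_graph=True)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Reset global state so the next test starts clean.
        clear_config()

with StubRunner():
    assert _CONFIG      # populated while the "test" runs
assert not _CONFIG      # cleared once the runner exits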

vllm_ascend/platform.py

Lines changed: 19 additions & 0 deletions

@@ -127,6 +127,25 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         model_config = vllm_config.model_config
         parallel_config = vllm_config.parallel_config
         cache_config = vllm_config.cache_config
+        scheduler_config = vllm_config.scheduler_config
+        ascend_scheduler_config = ascend_config.ascend_scheduler_config
+
+        if not model_config.use_mla:
+            logger.info(
+                "Non-MLA LLMs forcibly disable the chunked prefill feature, "
+                "as the performance of operators supporting this feature "
+                "is currently suboptimal.")
+            scheduler_config.enable_chunked_prefill = False
+            scheduler_config.chunked_prefill_enabled = False
+            if envs.VLLM_USE_V1 and \
+                    not model_config.is_multimodal_model and \
+                    not scheduler_config.delay_factor > 0 and \
+                    not scheduler_config.send_delta_data and \
+                    scheduler_config.policy == "fcfs" and \
+                    scheduler_config.num_scheduler_steps == 1:
+                ascend_scheduler_config.enabled = True
+                if hasattr(ascend_scheduler_config, "enable_chunked_prefill"):
+                    ascend_scheduler_config.enable_chunked_prefill = False
 
         if parallel_config:
             if parallel_config.enable_expert_parallel:
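The added branch can be read as a small decision rule: chunked prefill is always switched off for non-MLA models, and the Ascend scheduler is additionally opted in only when the V1 engine is active and the scheduler sits in its plain FCFS, single-step, non-multimodal configuration. Here is a self-contained sketch of that rule with stub config objects (the stubs are illustrative, not vllm's real classes):

from dataclasses import dataclass

@dataclass
class StubSchedulerConfig:
    # Field names mirror the attributes read by check_and_update_config.
    enable_chunked_prefill: bool = True
    chunked_prefill_enabled: bool = True
    delay_factor: float = 0.0
    send_delta_data: bool = False
    policy: str = "fcfs"
    num_scheduler_steps: int = 1

@dataclass
class StubAscendSchedulerConfig:
    enabled: bool = False
    enable_chunked_prefill: bool = True

def apply_non_mla_policy(use_mla, use_v1, is_multimodal, sched, ascend_sched):
    # Mirrors the added branch: force chunked prefill off for non-MLA
    # models; opt into the Ascend scheduler only on the plain V1 path.
    if use_mla:
        return
    sched.enable_chunked_prefill = False
    sched.chunked_prefill_enabled = False
    if use_v1 and not is_multimodal and not sched.delay_factor > 0 \
            and not sched.send_delta_data and sched.policy == "fcfs" \
            and sched.num_scheduler_steps == 1:
        ascend_sched.enabled = True
        ascend_sched.enable_chunked_prefill = False

sched = StubSchedulerConfig()
ascend = StubAscendSchedulerConfig()
apply_non_mla_policy(use_mla=False, use_v1=True, is_multimodal=False,
                     sched=sched, ascend_sched=ascend)
assert not sched.enable_chunked_prefill and ascend.enabled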
