
Commit 8fddc1d (1 parent: c3c9138)

[V0.11.0][Core] Restore scheduling logic under default configuration (vllm-project#3967)

This PR reverts the changes introduced in PR vllm-project#2894. Initially, due to performance issues with the older version of the chunked prefill ops, the default behavior was to use the Ascend scheduler so that chunked prefill stayed disabled. With the improved performance of the new chunked prefill ops, that interception strategy is no longer needed and has been removed. The change also aligns with the community's default configuration behavior.

Does this PR introduce any user-facing change? No.

How was this patch tested? CI passed with newly added and existing tests.

- vLLM version: v0.11.0
- vLLM main: vllm-project/vllm@83f478b

Signed-off-by: rjg-lyh <1318825571@qq.com>
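In practical terms, chunked prefill now follows the upstream vLLM default, and the Ascend scheduler becomes opt-in. A minimal usage sketch, assuming vllm-ascend's additional_config / ascend_scheduler_config keys (the model name is a placeholder; neither appears in this commit):

    from vllm import LLM

    # Opt back into the Ascend scheduler explicitly now that the
    # default-enable interception has been removed.
    llm = LLM(
        model="Qwen/Qwen2.5-7B-Instruct",  # placeholder model, not from the commit
        additional_config={
            "ascend_scheduler_config": {
                "enabled": True,  # re-enable the Ascend scheduler manually
            },
        },
    )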

File tree: 2 files changed (+12, −32 lines)


tests/ut/test_platform.py (1 addition, 1 deletion)

@@ -754,7 +754,7 @@ def test_aclgraph_enable(self):
         self.platform.check_and_update_config(VllmConfig)
         self.assertTrue(
             "PIECEWISE compilation enabled on NPU. use_inductor not supported - "
-            "using only ACL Graph mode" in cm.output[1])
+            "using only ACL Graph mode" in cm.output[0])
         self.assertEqual(
             VllmConfig.compilation_config.level,
             CompilationLevel.PIECEWISE,
vllm_ascend/platform.py (11 additions, 31 deletions)
@@ -129,36 +129,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         model_config = vllm_config.model_config
         parallel_config = vllm_config.parallel_config
         cache_config = vllm_config.cache_config
-        scheduler_config = vllm_config.scheduler_config
         ascend_scheduler_config = ascend_config.ascend_scheduler_config
-        structured_outputs_config = vllm_config.structured_outputs_config
-
-        if (model_config is not None and not model_config.use_mla
-                and not scheduler_config.async_scheduling
-                and model_config.runner_type != "pooling"):
-            logger.info(
-                "Non-MLA LLMs forcibly disable the chunked prefill feature,"
-                "as the performance of operators supporting this feature "
-                "functionality is currently suboptimal.")
-            if not model_config.is_multimodal_model and \
-                structured_outputs_config.backend == "auto" and \
-                not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
-                not scheduler_config.send_delta_data and \
-                scheduler_config.policy == "fcfs":
-                ascend_scheduler_config.enabled = True
-                chunked_prefill_enabled_in_ascend_scheduler = getattr(
-                    ascend_scheduler_config, "enable_chunked_prefill", False)
-                if chunked_prefill_enabled_in_ascend_scheduler:
-                    logger.warning(
-                        "Chunked prefill feature is enabled in ascend_scheduler,"
-                        "but note that the operator supporting this feature "
-                        "would lead to performance degradation.")
-                # In this situation, max_num_batched_tokens would have been rewritten.
-                # So we must make sure max_num_batched_tokens is not smaller than max_model_len.
-                if (scheduler_config.max_num_batched_tokens
-                        < scheduler_config.max_model_len
-                        and not chunked_prefill_enabled_in_ascend_scheduler):
-                    scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len

         kv_cache_dtype = vllm_config.additional_config.get(
             "kv_cache_dtype", None)
@@ -293,11 +264,20 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         if cache_config.block_size is None:
             cache_config.block_size = 128

-        if cache_config.enable_prefix_caching and cache_config.block_size != 128:
+        if cache_config.enable_prefix_caching or \
+            not ascend_scheduler_config.enabled or \
+            getattr(ascend_scheduler_config, "enable_chunked_prefill", False):
             logger.warning(
-                "If prefix caching is enabled, block size must be set to 128."
+                "If chunked prefill or prefix caching is enabled, block size must be set to 128."
             )
+            origin_block_size = cache_config.block_size
             cache_config.block_size = 128
+            # TODO(MengqingCao): Remove the model_type check, after resolving the hidden error in get_kv_cache_groups.
+            if model_config and model_config.hf_config.model_type == "qwen3_next":
+                logger.warning(
+                    "When running qwen3-next model, block_size needs to be restored to its original value."
+                )
+                cache_config.block_size = origin_block_size

         # Activate custom ops for v1, except on 310P
         if not is_310p():
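Distilled as a standalone rule, the new hunk forces 128-token blocks whenever prefix caching or chunked prefill is in play, with a temporary carve-out for qwen3-next. A minimal sketch with hypothetical plain arguments in place of the vLLM config objects (not code from the diff):

    def resolve_block_size(block_size: int,
                           enable_prefix_caching: bool,
                           ascend_scheduler_enabled: bool,
                           chunked_prefill_in_ascend_scheduler: bool,
                           model_type: str) -> int:
        """Mirror the block-size decision introduced in this hunk."""
        if (enable_prefix_caching or not ascend_scheduler_enabled
                or chunked_prefill_in_ascend_scheduler):
            if model_type == "qwen3_next":
                # Keep the original block size until the get_kv_cache_groups
                # issue noted in the TODO above is resolved.
                return block_size
            # Prefix-caching / chunked-prefill paths require 128-token blocks.
            return 128
        return block_size

    assert resolve_block_size(64, True, True, False, "llama") == 128
    assert resolve_block_size(64, True, True, False, "qwen3_next") == 64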
