Skip to content

Commit c625f90

Browse files
authored
[V0 deprecation] Remove _set_default_args_v0 function (#25409)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
1 parent 6fa78d8 commit c625f90

File tree

1 file changed

+11
-72
lines changed

1 file changed

+11
-72
lines changed

vllm/engine/arg_utils.py

Lines changed: 11 additions & 72 deletions
Original file line number · Diff line number · Diff line change
@@ -1147,20 +1147,15 @@ def create_engine_config(
11471147
else:
11481148
envs.set_vllm_use_v1(use_v1)
11491149

1150-
# Set default arguments for V0 or V1 Engine.
1151-
if use_v1:
1152-
self._set_default_args_v1(usage_context, model_config)
1153-
# Disable chunked prefill for POWER (ppc64le)/ARM/s390x CPUs in V1
1154-
if current_platform.is_cpu(
1155-
) and current_platform.get_cpu_architecture() in (
1156-
CpuArchEnum.POWERPC, CpuArchEnum.S390X, CpuArchEnum.ARM):
1157-
logger.info(
1158-
"Chunked prefill is not supported for ARM and POWER "
1159-
"and S390X CPUs; "
1160-
"disabling it for V1 backend.")
1161-
self.enable_chunked_prefill = False
1162-
else:
1163-
self._set_default_args_v0(model_config)
1150+
# Set default arguments for V1 Engine.
1151+
self._set_default_args(usage_context, model_config)
1152+
# Disable chunked prefill for POWER (ppc64le)/ARM/s390x CPUs in V1
1153+
if current_platform.is_cpu() and current_platform.get_cpu_architecture(
1154+
) in (CpuArchEnum.POWERPC, CpuArchEnum.S390X, CpuArchEnum.ARM):
1155+
logger.info("Chunked prefill is not supported for ARM and POWER "
1156+
"and S390X CPUs; "
1157+
"disabling it for V1 backend.")
1158+
self.enable_chunked_prefill = False
11641159
assert self.enable_chunked_prefill is not None
11651160

11661161
sliding_window: Optional[int] = None
@@ -1528,64 +1523,8 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
15281523

15291524
return True
15301525

1531-
def _set_default_args_v0(self, model_config: ModelConfig) -> None:
1532-
"""Set Default Arguments for V0 Engine."""
1533-
1534-
max_model_len = model_config.max_model_len
1535-
use_long_context = max_model_len > 32768
1536-
if self.enable_chunked_prefill is None:
1537-
# Chunked prefill not supported for Multimodal or MLA in V0.
1538-
if model_config.is_multimodal_model or model_config.use_mla:
1539-
self.enable_chunked_prefill = False
1540-
1541-
# Enable chunked prefill by default for long context (> 32K)
1542-
# models to avoid OOM errors in initial memory profiling phase.
1543-
elif use_long_context:
1544-
is_gpu = current_platform.is_cuda()
1545-
use_sliding_window = (model_config.get_sliding_window()
1546-
is not None)
1547-
use_spec_decode = self.speculative_config is not None
1548-
1549-
if (is_gpu and not use_sliding_window and not use_spec_decode
1550-
and not self.enable_lora):
1551-
self.enable_chunked_prefill = True
1552-
logger.warning(
1553-
"Chunked prefill is enabled by default for models "
1554-
"with max_model_len > 32K. Chunked prefill might "
1555-
"not work with some features or models. If you "
1556-
"encounter any issues, please disable by launching "
1557-
"with --enable-chunked-prefill=False.")
1558-
1559-
if self.enable_chunked_prefill is None:
1560-
self.enable_chunked_prefill = False
1561-
1562-
if not self.enable_chunked_prefill and use_long_context:
1563-
logger.warning(
1564-
"The model has a long context length (%s). This may cause"
1565-
"OOM during the initial memory profiling phase, or result "
1566-
"in low performance due to small KV cache size. Consider "
1567-
"setting --max-model-len to a smaller value.", max_model_len)
1568-
1569-
# Disable prefix caching for multimodal models for VLLM_V0.
1570-
if self.enable_prefix_caching and model_config.is_multimodal_model:
1571-
logger.warning(
1572-
"--enable-prefix-caching is not supported for multimodal "
1573-
"models in V0 and has been disabled.")
1574-
self.enable_prefix_caching = False
1575-
1576-
if self.enable_prompt_embeds:
1577-
logger.warning(
1578-
"--enable-prompt-embeds and --enable-prefix-caching "
1579-
"are not supported together in V0. Prefix caching has "
1580-
"been disabled.")
1581-
self.enable_prefix_caching = False
1582-
1583-
# Set max_num_seqs to 256 for VLLM_V0.
1584-
if self.max_num_seqs is None:
1585-
self.max_num_seqs = 256
1586-
1587-
def _set_default_args_v1(self, usage_context: UsageContext,
1588-
model_config: ModelConfig) -> None:
1526+
def _set_default_args(self, usage_context: UsageContext,
1527+
model_config: ModelConfig) -> None:
15891528
"""Set Default Arguments for V1 Engine."""
15901529

15911530
# V1 always uses chunked prefills and prefix caching

0 commit comments

Comments (0)