@@ -1147,20 +1147,15 @@ def create_engine_config(
11471147 else :
11481148 envs .set_vllm_use_v1 (use_v1 )
11491149
1150- # Set default arguments for V0 or V1 Engine.
1151- if use_v1 :
1152- self ._set_default_args_v1 (usage_context , model_config )
1153- # Disable chunked prefill for POWER (ppc64le)/ARM/s390x CPUs in V1
1154- if current_platform .is_cpu (
1155- ) and current_platform .get_cpu_architecture () in (
1156- CpuArchEnum .POWERPC , CpuArchEnum .S390X , CpuArchEnum .ARM ):
1157- logger .info (
1158- "Chunked prefill is not supported for ARM and POWER "
1159- "and S390X CPUs; "
1160- "disabling it for V1 backend." )
1161- self .enable_chunked_prefill = False
1162- else :
1163- self ._set_default_args_v0 (model_config )
1150+ # Set default arguments for V1 Engine.
1151+ self ._set_default_args (usage_context , model_config )
1152+ # Disable chunked prefill for POWER (ppc64le)/ARM/s390x CPUs in V1
1153+ if current_platform .is_cpu () and current_platform .get_cpu_architecture (
1154+ ) in (CpuArchEnum .POWERPC , CpuArchEnum .S390X , CpuArchEnum .ARM ):
1155+ logger .info ("Chunked prefill is not supported for ARM and POWER "
1156+ "and S390X CPUs; "
1157+ "disabling it for V1 backend." )
1158+ self .enable_chunked_prefill = False
11641159 assert self .enable_chunked_prefill is not None
11651160
11661161 sliding_window : Optional [int ] = None
@@ -1528,64 +1523,8 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
15281523
15291524 return True
15301525
1531- def _set_default_args_v0 (self , model_config : ModelConfig ) -> None :
1532- """Set Default Arguments for V0 Engine."""
1533-
1534- max_model_len = model_config .max_model_len
1535- use_long_context = max_model_len > 32768
1536- if self .enable_chunked_prefill is None :
1537- # Chunked prefill not supported for Multimodal or MLA in V0.
1538- if model_config .is_multimodal_model or model_config .use_mla :
1539- self .enable_chunked_prefill = False
1540-
1541- # Enable chunked prefill by default for long context (> 32K)
1542- # models to avoid OOM errors in initial memory profiling phase.
1543- elif use_long_context :
1544- is_gpu = current_platform .is_cuda ()
1545- use_sliding_window = (model_config .get_sliding_window ()
1546- is not None )
1547- use_spec_decode = self .speculative_config is not None
1548-
1549- if (is_gpu and not use_sliding_window and not use_spec_decode
1550- and not self .enable_lora ):
1551- self .enable_chunked_prefill = True
1552- logger .warning (
1553- "Chunked prefill is enabled by default for models "
1554- "with max_model_len > 32K. Chunked prefill might "
1555- "not work with some features or models. If you "
1556- "encounter any issues, please disable by launching "
1557- "with --enable-chunked-prefill=False." )
1558-
1559- if self .enable_chunked_prefill is None :
1560- self .enable_chunked_prefill = False
1561-
1562- if not self .enable_chunked_prefill and use_long_context :
1563- logger .warning (
1564- "The model has a long context length (%s). This may cause"
1565- "OOM during the initial memory profiling phase, or result "
1566- "in low performance due to small KV cache size. Consider "
1567- "setting --max-model-len to a smaller value." , max_model_len )
1568-
1569- # Disable prefix caching for multimodal models for VLLM_V0.
1570- if self .enable_prefix_caching and model_config .is_multimodal_model :
1571- logger .warning (
1572- "--enable-prefix-caching is not supported for multimodal "
1573- "models in V0 and has been disabled." )
1574- self .enable_prefix_caching = False
1575-
1576- if self .enable_prompt_embeds :
1577- logger .warning (
1578- "--enable-prompt-embeds and --enable-prefix-caching "
1579- "are not supported together in V0. Prefix caching has "
1580- "been disabled." )
1581- self .enable_prefix_caching = False
1582-
1583- # Set max_num_seqs to 256 for VLLM_V0.
1584- if self .max_num_seqs is None :
1585- self .max_num_seqs = 256
1586-
1587- def _set_default_args_v1 (self , usage_context : UsageContext ,
1588- model_config : ModelConfig ) -> None :
1526+ def _set_default_args (self , usage_context : UsageContext ,
1527+ model_config : ModelConfig ) -> None :
15891528 """Set Default Arguments for V1 Engine."""
15901529
15911530 # V1 always uses chunked prefills and prefix caching
0 commit comments