diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index d4d801b155e1..b638e27deee6 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1147,20 +1147,15 @@ def create_engine_config(
         else:
             envs.set_vllm_use_v1(use_v1)
 
-        # Set default arguments for V0 or V1 Engine.
-        if use_v1:
-            self._set_default_args_v1(usage_context, model_config)
-            # Disable chunked prefill for POWER (ppc64le)/ARM/s390x CPUs in V1
-            if current_platform.is_cpu(
-            ) and current_platform.get_cpu_architecture() in (
-                    CpuArchEnum.POWERPC, CpuArchEnum.S390X, CpuArchEnum.ARM):
-                logger.info(
-                    "Chunked prefill is not supported for ARM and POWER "
-                    "and S390X CPUs; "
-                    "disabling it for V1 backend.")
-                self.enable_chunked_prefill = False
-        else:
-            self._set_default_args_v0(model_config)
+        # Set default arguments for V1 Engine.
+        self._set_default_args(usage_context, model_config)
+        # Disable chunked prefill for POWER (ppc64le)/ARM/s390x CPUs in V1
+        if current_platform.is_cpu() and current_platform.get_cpu_architecture(
+        ) in (CpuArchEnum.POWERPC, CpuArchEnum.S390X, CpuArchEnum.ARM):
+            logger.info("Chunked prefill is not supported for ARM and POWER "
+                        "and S390X CPUs; "
+                        "disabling it for V1 backend.")
+            self.enable_chunked_prefill = False
         assert self.enable_chunked_prefill is not None
 
         sliding_window: Optional[int] = None
@@ -1540,64 +1535,8 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
 
         return True
 
-    def _set_default_args_v0(self, model_config: ModelConfig) -> None:
-        """Set Default Arguments for V0 Engine."""
-
-        max_model_len = model_config.max_model_len
-        use_long_context = max_model_len > 32768
-        if self.enable_chunked_prefill is None:
-            # Chunked prefill not supported for Multimodal or MLA in V0.
-            if model_config.is_multimodal_model or model_config.use_mla:
-                self.enable_chunked_prefill = False
-
-            # Enable chunked prefill by default for long context (> 32K)
-            # models to avoid OOM errors in initial memory profiling phase.
-            elif use_long_context:
-                is_gpu = current_platform.is_cuda()
-                use_sliding_window = (model_config.get_sliding_window()
-                                      is not None)
-                use_spec_decode = self.speculative_config is not None
-
-                if (is_gpu and not use_sliding_window and not use_spec_decode
-                        and not self.enable_lora):
-                    self.enable_chunked_prefill = True
-                    logger.warning(
-                        "Chunked prefill is enabled by default for models "
-                        "with max_model_len > 32K. Chunked prefill might "
-                        "not work with some features or models. If you "
-                        "encounter any issues, please disable by launching "
-                        "with --enable-chunked-prefill=False.")
-
-        if self.enable_chunked_prefill is None:
-            self.enable_chunked_prefill = False
-
-        if not self.enable_chunked_prefill and use_long_context:
-            logger.warning(
-                "The model has a long context length (%s). This may cause"
-                "OOM during the initial memory profiling phase, or result "
-                "in low performance due to small KV cache size. Consider "
-                "setting --max-model-len to a smaller value.", max_model_len)
-
-        # Disable prefix caching for multimodal models for VLLM_V0.
-        if self.enable_prefix_caching and model_config.is_multimodal_model:
-            logger.warning(
-                "--enable-prefix-caching is not supported for multimodal "
-                "models in V0 and has been disabled.")
-            self.enable_prefix_caching = False
-
-        if self.enable_prompt_embeds:
-            logger.warning(
-                "--enable-prompt-embeds and --enable-prefix-caching "
-                "are not supported together in V0. Prefix caching has "
-                "been disabled.")
-            self.enable_prefix_caching = False
-
-        # Set max_num_seqs to 256 for VLLM_V0.
-        if self.max_num_seqs is None:
-            self.max_num_seqs = 256
-
-    def _set_default_args_v1(self, usage_context: UsageContext,
-                             model_config: ModelConfig) -> None:
+    def _set_default_args(self, usage_context: UsageContext,
+                          model_config: ModelConfig) -> None:
         """Set Default Arguments for V1 Engine."""
 
         # V1 always uses chunked prefills and prefix caching