From c332ed48ae143d0260dbf83f5ea906c8f17b95cc Mon Sep 17 00:00:00 2001
From: Matthew Bonanni
Date: Thu, 25 Sep 2025 09:26:27 -0400
Subject: [PATCH 1/2] remove oracle

Signed-off-by: Matthew Bonanni
---
 .buildkite/test-pipeline.yaml |   1 -
 vllm/engine/arg_utils.py      | 104 +---------------------------------
 2 files changed, 1 insertion(+), 104 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 200ed344c4e8..74ca7b21ae06 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -302,7 +302,6 @@ steps:
   - pytest -v -s v1/metrics
   - pytest -v -s v1/test_serial_utils.py
   - pytest -v -s v1/test_utils.py
-  - pytest -v -s v1/test_oracle.py
   - pytest -v -s v1/test_metrics_reader.py
   # Integration test for streaming correctness (requires special branch).
   - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 3d48d2a0b22d..592fc2c98af5 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1131,15 +1131,7 @@ def create_engine_config(
         )
         model_config = self.create_model_config()
 
-        # * If VLLM_USE_V1 is unset, we enable V1 for "supported features"
-        #   and fall back to V0 for experimental or unsupported features.
-        # * If VLLM_USE_V1=1, we enable V1 for supported + experimental
-        #   features and raise error for unsupported features.
-        # * If VLLM_USE_V1=0, we disable V1.
-        use_v1 = False
-        try_v1 = envs.VLLM_USE_V1 or not envs.is_set("VLLM_USE_V1")
-        if try_v1 and self._is_v1_supported_oracle(model_config):
-            use_v1 = True
+        use_v1 = envs.VLLM_USE_V1 or not envs.is_set("VLLM_USE_V1")
 
         # If user explicitly set VLLM_USE_V1, sanity check we respect it.
         if envs.is_set("VLLM_USE_V1"):
@@ -1437,100 +1429,6 @@ def create_engine_config(
 
         return config
 
-    def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
-        """Oracle for whether to use V0 or V1 Engine by default."""
-
-        #############################################################
-        # Unsupported Feature Flags on V1.
-
-        if (self.logits_processor_pattern
-                != EngineArgs.logits_processor_pattern):
-            _raise_or_fallback(feature_name="--logits-processor-pattern",
-                               recommend_to_remove=False)
-            return False
-
-        # No Mamba or Encoder-Decoder so far.
-        if not model_config.is_v1_compatible:
-            _raise_or_fallback(feature_name=model_config.architectures,
-                               recommend_to_remove=False)
-            return False
-
-        # No Concurrent Partial Prefills so far.
-        if (self.max_num_partial_prefills
-                != SchedulerConfig.max_num_partial_prefills
-                or self.max_long_partial_prefills
-                != SchedulerConfig.max_long_partial_prefills):
-            _raise_or_fallback(feature_name="Concurrent Partial Prefill",
-                               recommend_to_remove=False)
-            return False
-
-        # V1 supports N-gram, Medusa, and Eagle speculative decoding.
-        if self.speculative_config is not None:
-            # speculative_config could still be a dict at this point
-            if isinstance(self.speculative_config, dict):
-                method = self.speculative_config.get("method", None)
-            else:
-                method = self.speculative_config.method
-
-            if method == "draft_model":
-                raise NotImplementedError(
-                    "Draft model speculative decoding is not supported yet. "
-                    "Please consider using other speculative decoding methods "
-                    "such as ngram, medusa, eagle, or deepseek_mtp.")
-
-        V1_BACKENDS = [
-            "FLASH_ATTN_VLLM_V1",
-            "FLASH_ATTN",
-            "PALLAS",
-            "PALLAS_VLLM_V1",
-            "TRITON_ATTN_VLLM_V1",
-            "TRITON_MLA",
-            "CUTLASS_MLA",
-            "FLASHMLA",
-            "FLASHMLA_VLLM_V1",
-            "FLASH_ATTN_MLA",
-            "FLASHINFER",
-            "FLASHINFER_VLLM_V1",
-            "FLASHINFER_MLA",
-            "ROCM_AITER_MLA",
-            "TORCH_SDPA_VLLM_V1",
-            "FLEX_ATTENTION",
-            "TREE_ATTN",
-            "XFORMERS_VLLM_V1",
-            "ROCM_ATTN_VLLM_V1",
-        ]
-        if (envs.is_set("VLLM_ATTENTION_BACKEND")
-                and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS):
-            name = f"VLLM_ATTENTION_BACKEND={envs.VLLM_ATTENTION_BACKEND}"
-            _raise_or_fallback(feature_name=name, recommend_to_remove=True)
-            return False
-
-        #############################################################
-        # Experimental Features - allow users to opt in.
-
-        if self.pipeline_parallel_size > 1:
-            supports_pp = getattr(self.distributed_executor_backend,
-                                  'supports_pp', False)
-            if not supports_pp and self.distributed_executor_backend not in (
-                    ParallelConfig.distributed_executor_backend, "ray", "mp",
-                    "external_launcher"):
-                name = "Pipeline Parallelism without Ray distributed " \
-                    "executor or multiprocessing executor or external " \
-                    "launcher"
-                _raise_or_fallback(feature_name=name,
-                                   recommend_to_remove=False)
-                return False
-
-        if (current_platform.is_cpu()
-                and model_config.get_sliding_window() is not None):
-            _raise_or_fallback(feature_name="sliding window (CPU backend)",
-                               recommend_to_remove=False)
-            return False
-
-        #############################################################
-
-        return True
-
     def _set_default_args(self, usage_context: UsageContext,
                           model_config: ModelConfig) -> None:
         """Set Default Arguments for V1 Engine."""

From a43907de06a6f9a4d7a8568b510b40ba26b2bc32 Mon Sep 17 00:00:00 2001
From: Matthew Bonanni
Date: Thu, 25 Sep 2025 09:34:22 -0400
Subject: [PATCH 2/2] remove test_oracle.py

Signed-off-by: Matthew Bonanni
---
 tests/v1/test_oracle.py | 79 -----------------------------------------
 1 file changed, 79 deletions(-)
 delete mode 100644 tests/v1/test_oracle.py

diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py
deleted file mode 100644
index f6b8a18dd7c2..000000000000
--- a/tests/v1/test_oracle.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os
-
-import pytest
-
-import vllm.envs as envs
-from vllm import LLM
-from vllm.engine.arg_utils import AsyncEngineArgs
-
-MODEL = "meta-llama/Llama-3.2-1B-Instruct"
-
-
-def test_reject_bad_config(monkeypatch):
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "0")
-
-
-def test_unsupported_configs(monkeypatch):
-
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        with pytest.raises(NotImplementedError):
-            AsyncEngineArgs(
-                model=MODEL,
-                speculative_config={
-                    "model": MODEL,
-                },
-            ).create_engine_config()
-
-
-def test_enable_by_default_fallback(monkeypatch):
-    with monkeypatch.context() as m:
-        if os.getenv("VLLM_USE_V1", None):
-            m.delenv("VLLM_USE_V1")
-
-        # Should default to V1 for supported config.
-        _ = AsyncEngineArgs(
-            model=MODEL,
-            enforce_eager=True,
-        ).create_engine_config()
-        assert envs.VLLM_USE_V1
-        m.delenv("VLLM_USE_V1")
-
-
-def test_v1_llm_by_default(monkeypatch):
-    with monkeypatch.context() as m:
-        if os.getenv("VLLM_USE_V1", None):
-            m.delenv("VLLM_USE_V1")
-
-        # Should default to V1 for supported config.
-        llm = LLM(MODEL, enforce_eager=True, enable_lora=True)
-        print(llm.generate("Hello my name is"))
-        assert hasattr(llm.llm_engine, "engine_core")
-        m.delenv("VLLM_USE_V1")
-
-
-def test_v1_attn_backend(monkeypatch):
-    with monkeypatch.context() as m:
-        if os.getenv("VLLM_USE_V1", None):
-            m.delenv("VLLM_USE_V1")
-        m.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
-
-        # Fall back to V0.
-        _ = AsyncEngineArgs(model=MODEL).create_engine_config()
-        assert not envs.VLLM_USE_V1
-        m.delenv("VLLM_USE_V1")
-
-        # Reject if V1.
-        m.setenv("VLLM_USE_V1", "1")
-        with pytest.raises(NotImplementedError):
-            AsyncEngineArgs(model=MODEL).create_engine_config()
-        m.delenv("VLLM_USE_V1")
-
-        m.setenv("VLLM_ATTENTION_BACKEND", "FLASHMLA")
-        _ = AsyncEngineArgs(model=MODEL).create_engine_config()
-        assert envs.VLLM_USE_V1
-        m.delenv("VLLM_USE_V1")