
Commit 2a40cef

[3/N][CI/UT] add spec decode e2e UT
Signed-off-by: mengwei805 <mengwei25@huawei.com>
1 parent edeadde commit 2a40cef

File tree

13 files changed: +834 -33 lines changed


tests/spec_decode/e2e/test_eagle_correctness.py

Lines changed: 483 additions & 0 deletions
Large diffs are not rendered by default.
tests/spec_decode/e2e/test_mtp_correctness.py

Lines changed: 340 additions & 0 deletions
@@ -0,0 +1,340 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
# Adapted from vllm-project/vllm/tests/spec_decode/e2e/test_mtp_correctness.py
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""This docstring details important information on the testing methodology.

Most of the tests rely on "greedy equality", where we expect the output of
speculative decoding on a sequence to exactly match the output of normal non-
speculative decoding.

Since speculative decoding with rejection sampling guarantees that the output
distribution matches the target model's output distribution (up to hardware
numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy
equality.

However, we still need to verify that the following scenarios pass:
* Batch size 1 greedy equality
* Batch size >1 greedy equality
* Test greedy equality under preemption
* Test greedy equality under various numbers of speculative tokens.

With those tests, we can say that, at the very least, MTP does not break
the correctness of the target model outputs.
"""

import pytest

from .conftest import run_equality_correctness_test

# main model
# NOTE vLLM uses an fp8 model here; vllm-ascend uses a bf16 model.
MAIN_MODEL = "wemaster/deepseek_mtp_main_random_bf16"

# max. number of speculative tokens: this corresponds to
# num_nextn_predict_layers in the config.json of the speculator model.
MAX_SPEC_TOKENS = 1

# NOTE vLLM uses bfloat16 here; once vllm-ascend supports e2e float32,
# this should be set to "float32".
# precision
PRECISION = "bfloat16"


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,

        # Print spec metrics.
        "disable_log_stats": False,

        # Precision
        "dtype": PRECISION,

        # Main model
        "model_name": MAIN_MODEL,

        # GPU memory utilization
        "gpu_memory_utilization": 0.85
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
    {
        "num_speculative_tokens": MAX_SPEC_TOKENS,
    },
])
@pytest.mark.parametrize("output_len", [
    128,
])
@pytest.mark.parametrize("batch_size", [1, 32])
@pytest.mark.parametrize("seed", [1])
def test_mtp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
                                    per_test_common_llm_kwargs,
                                    baseline_llm_kwargs, test_llm_kwargs,
                                    batch_size: int, output_len: int,
                                    seed: int):

    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
                                  per_test_common_llm_kwargs,
                                  baseline_llm_kwargs, test_llm_kwargs,
                                  batch_size, output_len, seed)


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,

        # Print spec metrics.
        "disable_log_stats": False,

        # Precision
        "dtype": PRECISION,

        # Main model
        "model_name": MAIN_MODEL,

        # GPU memory utilization
        "gpu_memory_utilization": 0.85
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
    {
        "num_speculative_tokens": MAX_SPEC_TOKENS,
        "disable_logprobs_during_spec_decoding": False,
    },
    {
        "num_speculative_tokens": MAX_SPEC_TOKENS,
        "disable_logprobs_during_spec_decoding": True,
    },
])
@pytest.mark.parametrize("output_len", [
    128,
])
@pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize("seed", [1])
@pytest.mark.parametrize("logprobs", [1, 6])
def test_mtp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
                                 per_test_common_llm_kwargs,
                                 baseline_llm_kwargs, test_llm_kwargs,
                                 batch_size: int, output_len: int, seed: int,
                                 logprobs: int):

    run_equality_correctness_test(vllm_runner,
                                  common_llm_kwargs,
                                  per_test_common_llm_kwargs,
                                  baseline_llm_kwargs,
                                  test_llm_kwargs,
                                  batch_size,
                                  output_len,
                                  seed,
                                  logprobs=logprobs,
                                  prompt_logprobs=logprobs,
                                  disable_logprobs=test_llm_kwargs[
                                      'disable_logprobs_during_spec_decoding'])


# TODO: Enable this test once vllm-ascend supports graph mode, i.e. once
# the model can run with enforce_eager set to False.
# @pytest.mark.parametrize(
#     "common_llm_kwargs",
#     [{
#         "enforce_eager": False,

#         # Print spec metrics.
#         "disable_log_stats": False,

#         # Precision
#         "dtype": PRECISION,

#         # Main model
#         "model_name": MAIN_MODEL,
#         "gpu_memory_utilization": 0.85
#     }])
# @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
# @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
# @pytest.mark.parametrize("test_llm_kwargs", [
#     {
#         "num_speculative_tokens": MAX_SPEC_TOKENS,
#     },
# ])
# @pytest.mark.parametrize("output_len", [
#     128,
# ])
# @pytest.mark.parametrize("batch_size", [1, 32])
# @pytest.mark.parametrize("seed", [1])
# def test_mtp_e2e_greedy_correctness_cuda_graph(vllm_runner, common_llm_kwargs,
#                                                per_test_common_llm_kwargs,
#                                                baseline_llm_kwargs,
#                                                test_llm_kwargs,
#                                                batch_size: int,
#                                                output_len: int, seed: int):
#     """Verify greedy equality with cuda graph enabled and different
#     batch sizes."""
#     run_equality_correctness_test(vllm_runner, common_llm_kwargs,
#                                   per_test_common_llm_kwargs,
#                                   baseline_llm_kwargs, test_llm_kwargs,
#                                   batch_size, output_len, seed)


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        "block_size": 8,
        # 2 for small prompt, 256//8 for generated.
        "num_gpu_blocks_override": 2 + 256 // 8,
        "max_model_len": (2 + 256 // 8) * 8,

        # Skip cuda graph recording for fast test.
        "enforce_eager": True,

        # Precision
        "dtype": PRECISION,

        # Main model
        "model_name": MAIN_MODEL,

        # GPU memory utilization
        "gpu_memory_utilization": 0.9
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
    {
        "num_speculative_tokens": MAX_SPEC_TOKENS,
    },
])
@pytest.mark.parametrize(
    "output_len",
    [
        # Use small output len for fast test.
        128,
    ])
@pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("seed", [1])
def test_mtp_e2e_greedy_correctness_with_preemption(
        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
        seed: int):
    """Verify greedy equality, even when some sequences are preempted mid-
    generation.
    """
    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
                                  per_test_common_llm_kwargs,
                                  baseline_llm_kwargs, test_llm_kwargs,
                                  batch_size, output_len, seed)


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,

        # Precision
        "dtype": PRECISION,

        # Main model
        "model_name": MAIN_MODEL,

        # GPU memory utilization
        "gpu_memory_utilization": 0.9
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize(
    "test_llm_kwargs",
    [
        {
            "num_speculative_tokens": k,
        }
        # Try a range of num. speculative tokens
        for k in range(1, 1 + MAX_SPEC_TOKENS)
    ])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize(
    "output_len",
    [
        # Use smaller output len for fast test.
        32,
    ])
@pytest.mark.parametrize("seed", [1])
def test_mtp_different_k(vllm_runner, common_llm_kwargs,
                         per_test_common_llm_kwargs, baseline_llm_kwargs,
                         test_llm_kwargs, batch_size: int, output_len: int,
                         seed: int):
    """Verify that mtp speculative decoding produces the same output as
    normal decoding for different values of num_speculative_tokens.
    """
    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
                                  per_test_common_llm_kwargs,
                                  baseline_llm_kwargs, test_llm_kwargs,
                                  batch_size, output_len, seed)


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,

        # Precision
        "dtype": PRECISION,

        # Main model
        "model_name": MAIN_MODEL,

        # GPU memory utilization
        "gpu_memory_utilization": 0.9
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs",
                         [{
                             "num_speculative_tokens": MAX_SPEC_TOKENS,
                             "speculative_disable_by_batch_size": 4
                         }])
@pytest.mark.parametrize("batch_size", [1, 5])
@pytest.mark.parametrize(
    "output_len",
    [
        # Use smaller output len for fast test.
        32,
    ])
@pytest.mark.parametrize("seed", [1])
def test_mtp_disable_queue(vllm_runner, common_llm_kwargs,
                           per_test_common_llm_kwargs, baseline_llm_kwargs,
                           test_llm_kwargs, batch_size: int, output_len: int,
                           seed: int):
    """Verify that mtp speculative decoding produces the same output as
    normal decoding when speculation is disabled for large
    batch sizes.
    """
    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
                                  per_test_common_llm_kwargs,
                                  baseline_llm_kwargs, test_llm_kwargs,
                                  batch_size, output_len, seed)


if __name__ == "__main__":
    import pytest
    pytest.main([__file__])

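The helper run_equality_correctness_test is imported from the local conftest and is not part of this commit. As a rough, illustrative sketch only (the function below is hypothetical, not the actual conftest implementation), the greedy-equality check it performs amounts to generating with and without speculative decoding at temperature 0 and asserting token-for-token equality:

# Hypothetical sketch of a greedy-equality check; not the real conftest helper.
from typing import List


def assert_greedy_equality(baseline_outputs: List[List[int]],
                           spec_outputs: List[List[int]]) -> None:
    """Assert that spec-decode outputs match the baseline token-for-token.

    Each argument holds one greedily generated (temperature=0) token-id
    sequence per prompt, so the comparison is deterministic.
    """
    assert len(baseline_outputs) == len(spec_outputs)
    for i, (expected, actual) in enumerate(zip(baseline_outputs,
                                               spec_outputs)):
        assert expected == actual, (
            f"prompt {i}: baseline {expected} != spec decode {actual}")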
vllm_ascend/device_allocator/camem.py

Lines changed: 1 addition & 3 deletions
@@ -23,7 +23,7 @@
 
 import torch
 from acl.rt import memcpy  # type: ignore # noqa: F401
-from vllm.logger import init_logger
+from vllm.logger import logger
 
 try:
     import torch_npu  # noqa: F401
@@ -32,8 +32,6 @@
 
 from vllm.utils import is_pin_memory_available
 
-logger = init_logger(__name__)
-
 
 def find_loaded_library(lib_name) -> Optional[str]:
     """

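This change, repeated in the two files below, drops the per-module init_logger(__name__) call in favor of the logger object that vllm.logger already exposes. A minimal before/after sketch of the pattern (the logger.info call is purely illustrative):

# Before: each module built its own logger instance.
#   from vllm.logger import init_logger
#   logger = init_logger(__name__)

# After: reuse the shared logger exported by vllm.logger.
from vllm.logger import logger

logger.info("camem allocator initialized")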
vllm_ascend/patch/patch_spec_decode_worker.py

Lines changed: 1 addition & 3 deletions
@@ -18,7 +18,7 @@
 from typing import Any, Dict, Optional
 
 from vllm.config import ParallelConfig
-from vllm.logger import init_logger
+from vllm.logger import logger
 from vllm.model_executor.layers.rejection_sampler import RejectionSampler
 from vllm.model_executor.layers.spec_decode_base_sampler import \
     SpecDecodeBaseSampler
@@ -34,8 +34,6 @@
 
 from vllm_ascend.worker.draft_model_runner import TP1DraftModelRunner
 
-logger = init_logger(__name__)
-
 
 def create_worker(
     cls,

vllm_ascend/platform.py

Lines changed: 1 addition & 3 deletions
@@ -23,7 +23,7 @@
 import torch_npu  # noqa: F401
 import vllm.envs as envs
 from vllm.config import CompilationLevel, VllmConfig
-from vllm.logger import init_logger
+from vllm.logger import logger
 
 try:
     # register custom ops into torch_library here
@@ -46,8 +46,6 @@
 
 os.environ["RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES"] = "1"
 
-logger = init_logger(__name__)
-
 
 class NPUPlatform(Platform):
 