39 changes: 25 additions & 14 deletions docs/source/features/spec_decode.md
@@ -30,8 +30,10 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM(
model="facebook/opt-6.7b",
tensor_parallel_size=1,
speculative_model="facebook/opt-125m",
num_speculative_tokens=5,
speculative_config={
"model": "facebook/opt-125m",
"num_speculative_tokens": 5,
},
)
outputs = llm.generate(prompts, sampling_params)

@@ -45,10 +47,14 @@ To perform the same with an online mode launch the server:

```bash
python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model facebook/opt-6.7b \
--seed 42 -tp 1 --speculative_model facebook/opt-125m \
--num_speculative_tokens 5 --gpu_memory_utilization 0.8
--seed 42 -tp 1 --gpu_memory_utilization 0.8 \
--speculative_config '{"model": "facebook/opt-125m", "num_speculative_tokens": 5}'
```

:::{warning}
Use `--speculative_config` to set all options related to speculative decoding. The previous approach of specifying the draft model with `--speculative_model` and passing related parameters (e.g., `--num_speculative_tokens`) as separate flags will be deprecated in the next release.
:::
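
For illustration, other speculative decoding settings documented later on this page are passed through the same JSON object. A minimal sketch, reusing the model and flags from the example above together with the ngram keys from the offline example below:

```bash
# Sketch: ngram speculation configured entirely via --speculative_config.
# The JSON keys mirror the offline `speculative_config` dict shown later on this page.
python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 \
    --model facebook/opt-6.7b --seed 42 -tp 1 --gpu_memory_utilization 0.8 \
    --speculative_config '{"method": "ngram", "num_speculative_tokens": 5, "prompt_lookup_max": 4}'
```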

Then use a client:

```python
@@ -101,9 +107,11 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM(
model="facebook/opt-6.7b",
tensor_parallel_size=1,
speculative_model="[ngram]",
num_speculative_tokens=5,
ngram_prompt_lookup_max=4,
speculative_config={
"method": "ngram",
"num_speculative_tokens": 5,
"prompt_lookup_max": 4,
},
)
outputs = llm.generate(prompts, sampling_params)

@@ -131,8 +139,10 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM(
model="meta-llama/Meta-Llama-3.1-70B-Instruct",
tensor_parallel_size=4,
speculative_model="ibm-ai-platform/llama3-70b-accelerator",
speculative_draft_tensor_parallel_size=1,
speculative_config={
"model": "ibm-ai-platform/llama3-70b-accelerator",
"draft_tensor_parallel_size": 1,
},
)
outputs = llm.generate(prompts, sampling_params)

@@ -175,8 +185,10 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM(
model="meta-llama/Meta-Llama-3-8B-Instruct",
tensor_parallel_size=4,
speculative_model="yuhuili/EAGLE-LLaMA3-Instruct-8B",
speculative_draft_tensor_parallel_size=1,
speculative_config={
"model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
"draft_tensor_parallel_size": 1,
},
)

outputs = llm.generate(prompts, sampling_params)
@@ -194,11 +206,10 @@ A few important things to consider when using the EAGLE based draft models:
be able to be loaded and used directly by vLLM after [PR 12304](https://github.com/vllm-project/vllm/pull/12304).
If you are using vllm version before [PR 12304](https://github.com/vllm-project/vllm/pull/12304), please use the
[script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d) to convert the speculative model,
and specify `speculative_model="path/to/modified/eagle/model"`. If weight-loading problems still occur when using
the latest version of vLLM, please leave a comment or raise an issue.
and specify `"model": "path/to/modified/eagle/model"` in `speculative_config`. If weight-loading problems still occur when using the latest version of vLLM, please leave a comment or raise an issue.

2. The EAGLE based draft models need to be run without tensor parallelism
(i.e. speculative_draft_tensor_parallel_size is set to 1), although
(i.e. draft_tensor_parallel_size is set to 1 in `speculative_config`), although
it is possible to run the main model using tensor parallelism (see example above).

3. When using EAGLE-based speculators with vLLM, the observed speedup is lower than what is
4 changes: 3 additions & 1 deletion examples/offline_inference/mlpspeculator.py
@@ -50,7 +50,9 @@ def time_generation(llm: LLM, prompts: list[str],
# Create an LLM with spec decoding
llm = LLM(
model="meta-llama/Llama-2-13b-chat-hf",
speculative_model="ibm-ai-platform/llama-13b-accelerator",
speculative_config={
"model": "ibm-ai-platform/llama-13b-accelerator",
},
)

print("With speculation")
2 changes: 1 addition & 1 deletion tests/spec_decode/e2e/conftest.py
@@ -56,7 +56,7 @@ def generate():
def maybe_assert_ngram_worker(llm):
# Verify the proposer worker is ngram if ngram is specified.
if (llm.llm_engine.speculative_config is not None
and llm.llm_engine.speculative_config.ngram_prompt_lookup_max > 0):
and llm.llm_engine.speculative_config.method == "ngram"):
from vllm.spec_decode.ngram_worker import NGramWorker
assert isinstance(
llm.llm_engine.model_executor.driver_worker.proposer_worker,
29 changes: 20 additions & 9 deletions tests/spec_decode/e2e/test_compatibility.py
@@ -7,28 +7,39 @@
from .conftest import get_output_from_llm_generator


@pytest.mark.parametrize("common_llm_kwargs", [{
"model": "meta-llama/Llama-3.2-1B-Instruct",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
}])
@pytest.mark.parametrize("common_llm_kwargs",
[{
"model": "meta-llama/Llama-3.2-1B-Instruct",
}])
@pytest.mark.parametrize(
"per_test_common_llm_kwargs",
[
{
# Speculative max model len > overridden max model len should raise.
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"max_model_len": 129,
},
"max_model_len": 128,
"speculative_max_model_len": 129,
},
{
# Speculative max model len > draft max model len should raise.
# https://huggingface.co/JackFram/llama-68m/blob/3b606af5198a0b26762d589a3ee3d26ee6fa6c85/config.json#L12
"speculative_max_model_len": 2048 + 1,
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"max_model_len": 2048 + 1,
},
},
{
# Speculative max model len > target max model len should raise.
# https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18
"speculative_max_model_len": 131072 + 1,
# https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18
"speculative_config": {
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"max_model_len": 131072 + 1,
},
},
])
@pytest.mark.parametrize("test_llm_kwargs", [{}])
97 changes: 57 additions & 40 deletions tests/spec_decode/e2e/test_eagle_correctness.py
@@ -57,8 +57,10 @@
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
"speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
},
])
@pytest.mark.parametrize("output_len", [
@@ -95,18 +97,19 @@ def test_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": SPEC_MODEL,
@pytest.mark.parametrize("test_llm_kwargs", [{
"speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
"disable_logprobs_during_spec_decoding": False,
"disable_logprobs": False,
},
{
"speculative_model": SPEC_MODEL,
}, {
"speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
"disable_logprobs_during_spec_decoding": True,
"disable_logprobs": True,
},
])
}])
@pytest.mark.parametrize("output_len", [
128,
])
@@ -119,18 +122,19 @@ def test_eagle_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
batch_size: int, output_len: int, seed: int,
logprobs: int):

run_equality_correctness_test(vllm_runner,
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
output_len,
seed,
logprobs=logprobs,
prompt_logprobs=logprobs,
disable_logprobs=test_llm_kwargs[
'disable_logprobs_during_spec_decoding'])
run_equality_correctness_test(
vllm_runner,
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
output_len,
seed,
logprobs=logprobs,
prompt_logprobs=logprobs,
disable_logprobs=test_llm_kwargs["speculative_config"]
["disable_logprobs"])


@pytest.mark.parametrize(
@@ -151,8 +155,10 @@ def test_eagle_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
"speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
},
])
@pytest.mark.parametrize("output_len", [
@@ -193,8 +199,10 @@ def test_eagle_e2e_greedy_correctness_cuda_graph(
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
"speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
},
])
@pytest.mark.parametrize(
@@ -236,8 +244,10 @@ def test_eagle_e2e_greedy_correctness_with_preemption(
"test_llm_kwargs",
[
{
"speculative_model": SPEC_MODEL,
"num_speculative_tokens": k,
"speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": k,
},
}
# Try a range of num. speculative tokens
for k in range(1, 1 + MAX_SPEC_TOKENS)
@@ -277,12 +287,13 @@ def test_eagle_different_k(vllm_runner, common_llm_kwargs,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs",
[{
"speculative_model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
"speculative_disable_by_batch_size": 4
}])
@pytest.mark.parametrize("test_llm_kwargs", [{
"speculative_config": {
"model": SPEC_MODEL,
"num_speculative_tokens": MAX_SPEC_TOKENS,
"disable_by_batch_size": 4,
},
}])
@pytest.mark.parametrize("batch_size", [1, 5])
@pytest.mark.parametrize(
"output_len",
@@ -324,8 +335,10 @@ def test_eagle_disable_queue(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "yuhuili/EAGLE-llama2-chat-7B",
"num_speculative_tokens": MAX_SPEC_TOKENS,
"speculative_config": {
"model": "yuhuili/EAGLE-llama2-chat-7B",
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
},
])
@pytest.mark.parametrize(
@@ -372,8 +385,10 @@ def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
"num_speculative_tokens": MAX_SPEC_TOKENS,
"speculative_config": {
"model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
},
])
@pytest.mark.parametrize(
@@ -420,8 +435,10 @@ def test_llama3_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "yuhuili/EAGLE-Qwen2-7B-Instruct",
"num_speculative_tokens": MAX_SPEC_TOKENS,
"speculative_config": {
"model": "yuhuili/EAGLE-Qwen2-7B-Instruct",
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
},
])
@pytest.mark.parametrize(