Commit 239b7be

[V1][Spec Decode] Remove deprecated spec decode config params (#15466)
Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com>
1 parent 09e974d · commit 239b7be

File tree

10 files changed: +125 -220 lines changed

.buildkite/nightly-benchmarks/tests/serving-tests.json

Lines changed: 6 additions & 4 deletions

@@ -63,10 +63,12 @@
         "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
         "disable_log_requests": "",
         "tensor_parallel_size": 4,
-        "swap_space": 16,
-        "speculative_model": "turboderp/Qwama-0.5B-Instruct",
-        "num_speculative_tokens": 4,
-        "speculative_draft_tensor_parallel_size": 1
+        "swap_space": 16,
+        "speculative_config": {
+            "model": "turboderp/Qwama-0.5B-Instruct",
+            "num_speculative_tokens": 4,
+            "draft_tensor_parallel_size": 1
+        }
     },
     "client_parameters": {
         "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",

docs/source/features/spec_decode.md

Lines changed: 1 addition & 1 deletion

@@ -52,7 +52,7 @@ python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model
 ```
 
 :::{warning}
-Note: Please use `--speculative_config` to set all configurations related to speculative decoding. The previous method of specifying the model through `--speculative_model` and adding related parameters (e.g., `--num_speculative_tokens`) separately will be deprecated in the next release.
+Note: Please use `--speculative_config` to set all configurations related to speculative decoding. The previous method of specifying the model through `--speculative_model` and adding related parameters (e.g., `--num_speculative_tokens`) separately has now been removed.
 :::
 
 Then use a client:
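
For readers migrating offline-inference scripts, a minimal sketch of the consolidated form is below. The model names and parallel sizes are taken from the benchmark JSON above; the prompt and sampling settings are placeholders, not part of this commit.

from vllm import LLM, SamplingParams

# All speculative-decoding settings now live in a single dict; the removed
# flat kwargs (speculative_model=..., num_speculative_tokens=..., etc.) are
# no longer accepted.
llm = LLM(
    model="meta-llama/Meta-Llama-3.1-70B-Instruct",
    tensor_parallel_size=4,
    speculative_config={
        "model": "turboderp/Qwama-0.5B-Instruct",
        "num_speculative_tokens": 4,
        "draft_tensor_parallel_size": 1,
    },
)
outputs = llm.generate(["The future of AI is"],
                       SamplingParams(temperature=0.0, max_tokens=32))
print(outputs[0].outputs[0].text)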

examples/offline_inference/eagle.py

Lines changed: 6 additions & 4 deletions

@@ -69,10 +69,12 @@
         max_model_len=max_model_len,
         max_num_seqs=args.max_num_seqs,
         gpu_memory_utilization=0.8,
-        speculative_model=eagle_dir,
-        num_speculative_tokens=args.num_spec_tokens,
-        speculative_draft_tensor_parallel_size=args.draft_tp,
-        speculative_max_model_len=max_model_len,
+        speculative_config={
+            "model": eagle_dir,
+            "num_speculative_tokens": args.num_spec_tokens,
+            "draft_tensor_parallel_size": args.draft_tp,
+            "max_model_len": max_model_len,
+        },
         disable_log_stats=False,
     )
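
The eagle.py hunk also makes the key renaming easy to see: inside `speculative_config` the old `speculative_` prefix is dropped. A quick reference, taken directly from the replacements above:

# Removed flat LLM kwargs                     -> speculative_config keys
# speculative_model=...                       -> "model": ...
# num_speculative_tokens=...                  -> "num_speculative_tokens": ...
# speculative_draft_tensor_parallel_size=...  -> "draft_tensor_parallel_size": ...
# speculative_max_model_len=...               -> "max_model_len": ...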

tests/metrics/test_metrics.py

Lines changed: 8 additions & 4 deletions

@@ -248,8 +248,10 @@ def test_metric_spec_decode(
             dtype=dtype,
             disable_log_stats=False,
             gpu_memory_utilization=0.4,
-            speculative_model=model,
-            num_speculative_tokens=k,
+            speculative_config={
+                "model": model,
+                "num_speculative_tokens": k,
+            },
     ) as vllm_model:
 
         # Force log interval to be 0 to catch all metrics.
@@ -300,8 +302,10 @@ def test_metric_spec_decode_interval(
         dtype=dtype,
         disable_log_stats=False,
         gpu_memory_utilization=0.4,
-        speculative_model=model,
-        num_speculative_tokens=k,
+        speculative_config={
+            "model": model,
+            "num_speculative_tokens": k,
+        },
         enforce_eager=True,
     )

tests/models/test_initialization.py

Lines changed: 4 additions & 2 deletions

@@ -54,8 +54,10 @@ def _initalize_kv_caches_v1(self, vllm_config):
         model_info.default,
         tokenizer=model_info.tokenizer,
         tokenizer_mode=model_info.tokenizer_mode,
-        speculative_model=model_info.speculative_model,
-        num_speculative_tokens=1 if model_info.speculative_model else None,
+        speculative_config={
+            "model": model_info.speculative_model,
+            "num_speculative_tokens": 1,
+        } if model_info.speculative_model else None,
         trust_remote_code=model_info.trust_remote_code,
         load_format="dummy",
         hf_overrides=hf_overrides,

tests/spec_decode/e2e/test_integration_dist_tp2.py

Lines changed: 70 additions & 10 deletions

@@ -3,6 +3,7 @@
 tensor parallelism.
 """
 
+import json
 from typing import Optional
 
 import pytest
@@ -28,14 +29,14 @@
 @pytest.mark.parametrize("test_llm_kwargs", [
     [
         "--speculative_config",
-        str({
+        json.dumps({
             "model": "JackFram/llama-68m",
             "num_speculative_tokens": 3,
         }),
     ],
     [
         "--speculative_config",
-        str({
+        json.dumps({
             "model": "ngram",
             "num_speculative_tokens": 5,
             "prompt_lookup_max": 3,
@@ -88,15 +89,15 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
     "model, test_llm_kwargs",
     [("JackFram/llama-68m", [
         "--speculative_config",
-        str({
+        json.dumps({
            "model": "JackFram/llama-68m",
            "num_speculative_tokens": 5,
            "draft_tensor_parallel_size": 1,
        }),
    ]),
    ("ibm-granite/granite-3b-code-instruct", [
        "--speculative_config",
-        str({
+        json.dumps({
            "model": "ibm-granite/granite-3b-code-instruct",
            "num_speculative_tokens": 5,
            "draft_tensor_parallel_size": 1,
@@ -147,20 +148,20 @@ def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs,
 @pytest.mark.parametrize("model, test_llm_kwargs",
                          [("JackFram/llama-68m", [
                              "--speculative_config",
-                             str({
+                             json.dumps({
                                  "model": "JackFram/llama-68m",
                                  "num_speculative_tokens": 3,
                              }),
                          ]),
                          ("JackFram/llama-68m", [
                              "--speculative_config",
-                             str({
+                             json.dumps({
                                  "model": "JackFram/llama-68m",
                                  "num_speculative_tokens": 3,
                                  "draft_tensor_parallel_size": 1,
                              }),
                          ])])
-@pytest.mark.parametrize("logprobs", [None, 2])
+@pytest.mark.parametrize("logprobs", [None])
 @pytest.mark.parametrize("batch_size", [2])
 @pytest.mark.parametrize("seed", [1])
 def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs,
@@ -171,9 +172,68 @@ def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs,
     """Verify spec decode works well with same and different TP size for
     the draft model with chunked prefill.
     """
-    if logprobs:
-        test_llm_kwargs.extend(
-            ["--disable_logprobs_during_spec_decoding", "False"])
+    run_equality_correctness_test_tp(model,
+                                     common_llm_kwargs,
+                                     per_test_common_llm_kwargs,
+                                     baseline_llm_kwargs,
+                                     test_llm_kwargs,
+                                     batch_size,
+                                     max_output_len=32,
+                                     seed=seed,
+                                     temperature=0.0,
+                                     logprobs=logprobs)
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 2,
+                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [[
+        # Skip cuda graph recording for fast test.
+        "--enforce-eager",
+        "--tensor_parallel_size",
+        "2",
+
+        # precision
+        "--dtype",
+        "bfloat16",
+    ]])
+@pytest.mark.parametrize(
+    "per_test_common_llm_kwargs",
+    [["--enable-chunked-prefill", "False"],
+     [
+         "--enable-chunked-prefill", "True", "--max-num-batched-tokens", "4",
+         "--max-num-seqs", "4"
+     ]])
+@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
+@pytest.mark.parametrize("model, test_llm_kwargs",
+                         [("JackFram/llama-68m", [
+                             "--speculative_config",
+                             json.dumps({
+                                 "model": "JackFram/llama-68m",
+                                 "num_speculative_tokens": 3,
+                                 "disable_logprobs": False,
+                             }),
+                         ]),
+                         ("JackFram/llama-68m", [
+                             "--speculative_config",
+                             json.dumps({
+                                 "model": "JackFram/llama-68m",
+                                 "num_speculative_tokens": 3,
+                                 "draft_tensor_parallel_size": 1,
+                                 "disable_logprobs": False,
+                             }),
+                         ])])
+@pytest.mark.parametrize("logprobs", [2])
+@pytest.mark.parametrize("batch_size", [2])
+@pytest.mark.parametrize("seed", [1])
+def test_spec_decode_chunked_prefill_tp2_with_logprobs(
+        model, common_llm_kwargs, per_test_common_llm_kwargs,
+        baseline_llm_kwargs, test_llm_kwargs, logprobs: Optional[int],
+        batch_size: int, seed: int):
+    """Verify spec decode works well with same and different TP size for
+    the draft model with chunked prefill.
+    """
     run_equality_correctness_test_tp(model,
                                      common_llm_kwargs,
                                      per_test_common_llm_kwargs,
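
The switch from `str()` to `json.dumps()` in these CLI argument lists is the substantive part of the test changes: a Python dict's `str()` form uses single quotes, which is not valid JSON, whereas `json.dumps()` produces a string a JSON parser accepts. A standalone illustration (plain Python, no vLLM required; how `--speculative_config` is parsed on the server side is assumed here, not shown in this diff):

import json

cfg = {"model": "JackFram/llama-68m", "num_speculative_tokens": 3}

print(str(cfg))         # {'model': 'JackFram/llama-68m', 'num_speculative_tokens': 3}
print(json.dumps(cfg))  # {"model": "JackFram/llama-68m", "num_speculative_tokens": 3}

json.loads(json.dumps(cfg))  # round-trips cleanly
# json.loads(str(cfg))       # would raise json.JSONDecodeError: single quotes are not valid JSON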

tests/spec_decode/e2e/test_integration_dist_tp4.py

Lines changed: 4 additions & 2 deletions

@@ -3,6 +3,8 @@
 tensor parallelism.
 """
 
+import json
+
 import openai
 import pytest
 import torch
@@ -33,7 +35,7 @@
     #TODO(wooyeon): add spec_draft_dp=2 case
     [
         "--speculative_config",
-        str({
+        json.dumps({
             "model": f"{SPEC_MODEL}",
             "num_speculative_tokens": 5,
             "draft_tensor_parallel_size": 1,
@@ -80,7 +82,7 @@ def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs,
         # Artificially limit the draft model max model len; this forces vLLM
         # to skip speculation once the sequences grow beyond 32-k tokens.
         "--speculative_config",
-        str({
+        json.dumps({
             "model": f"{SPEC_MODEL}",
             "num_speculative_tokens": 5,
             "max_model_len": 32,

tests/v1/test_oracle.py

Lines changed: 3 additions & 1 deletion

@@ -49,7 +49,9 @@ def test_unsupported_configs(monkeypatch):
     with pytest.raises(NotImplementedError):
         AsyncEngineArgs(
             model=MODEL,
-            speculative_model=MODEL,
+            speculative_config={
+                "model": MODEL,
+            },
         ).create_engine_config()
 
     with pytest.raises(NotImplementedError):

vllm/config.py

Lines changed: 9 additions & 10 deletions

@@ -2047,14 +2047,13 @@ def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig:
 
     def __post_init__(self):
 
-        # Note: After next release, the method parameter will be used to
-        # specify the speculative method, which helps to extend the
-        # configuration of non-model-based proposers, and the model parameter
-        # will be used when the draft model or head is needed.
-        # If users do not specify the method, the speculative method will
-        # be detected automatically if possible. If the speculative method can
-        # not be detected, it will be considered as the draft-model-based
-        # method by default.
+        # Note: "method" is a new parameter that helps to extend the
+        # configuration of non-model-based proposers, and the "model" parameter
+        # will be used to set the draft model, eagle head, or additional weight
+        # when needed. If users do not specify "method", the speculative method
+        # will be detected automatically if possible. If the speculative method
+        # can not be detected, it will be considered as the "draft_model" by
+        # default.
 
         if self.model is None and self.num_speculative_tokens is not None:
             # TODO(Shangming): Refactor mtp configuration logic when supporting
@@ -2069,8 +2068,8 @@ def __post_init__(self):
             raise ValueError("num_speculative_tokens was provided without "
                              "speculative model.")
 
-        # Automatically configure the ngram method during configuration
-        # refactoring to ensure a smooth transition.
+        # Automatically configure the method for ngram when "model" is used
+        # instead of "method"
         if self.method is None and (self.model is not None
                                     and self.model in ("ngram", "[ngram]")):
             self.method = "ngram"
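
The rewritten comments and the ngram branch above describe the intended split between "method" and "model". A small sketch of the two spellings, with values borrowed from the TP2 test earlier in this commit; the explicit-"method" form is the direction the comment points to, and how a missing "model" is filled in lives in code outside this hunk:

# Legacy spelling: "model" carries the sentinel string "ngram"; the branch
# shown above then sets method = "ngram" automatically.
legacy = {
    "model": "ngram",
    "num_speculative_tokens": 5,
    "prompt_lookup_max": 3,
}

# Explicit spelling enabled by the new "method" parameter (no draft model is
# needed for ngram proposals).
explicit = {
    "method": "ngram",
    "num_speculative_tokens": 5,
    "prompt_lookup_max": 3,
}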
