1111@pytest .fixture
1212def test_prompts ():
1313 prompt_types = ["repeat" , "sentence" ]
14- num_prompts = 100
14+ num_prompts = 10
1515 prompts = []
1616
1717 random .seed (0 )
@@ -69,6 +69,7 @@ def test_ngram_correctness(
6969 Compare the outputs of a original LLM and a speculative LLM
7070 should be the same when using ngram speculative decoding.
7171 '''
72+ pytest .skip ("Not current support for the test." )
7273 with monkeypatch .context () as m :
7374 m .setenv ("VLLM_USE_V1" , "1" )
7475
@@ -116,11 +117,12 @@ def test_eagle_correctness(
116117 Compare the outputs of a original LLM and a speculative LLM
117118 should be the same when using eagle speculative decoding.
118119 '''
119- pytest .skip ("Not current support for the test." )
120+ if not use_eagle3 :
121+ pytest .skip ("Not current support for the test." )
120122 with monkeypatch .context () as m :
121123 m .setenv ("VLLM_USE_V1" , "1" )
122124
123- ref_llm = LLM (model = model_name , max_model_len = 2048 )
125+ ref_llm = LLM (model = model_name , max_model_len = 2048 , enforce_eager = True )
124126 ref_outputs = ref_llm .chat (test_prompts , sampling_config )
125127 del ref_llm
126128
@@ -129,13 +131,17 @@ def test_eagle_correctness(
129131 spec_llm = LLM (
130132 model = model_name ,
131133 trust_remote_code = True ,
134+ enable_chunked_prefill = True ,
135+ max_num_seqs = 1 ,
136+ max_num_batched_tokens = 2048 ,
137+ gpu_memory_utilization = 0.6 ,
132138 speculative_config = {
133139 "method" : "eagle3" if use_eagle3 else "eagle" ,
134140 "model" : spec_model_name ,
135- "num_speculative_tokens" : 3 ,
136- "max_model_len" : 2048 ,
141+ "num_speculative_tokens" : 2 ,
142+ "max_model_len" : 128 ,
137143 },
138- max_model_len = 2048 ,
144+ max_model_len = 128 ,
139145 enforce_eager = True ,
140146 )
141147 spec_outputs = spec_llm .chat (test_prompts , sampling_config )
0 commit comments