# SPDX-License-Identifier: Apache-2.0
+"""
+This file demonstrates the usage of text generation with an LLM model,
+comparing the performance with and without speculative decoding.
+
+Note that `v1` is not supported yet, so run with:
+VLLM_USE_V1=0 python examples/offline_inference/mlpspeculator.py
+"""

import gc
import time

from vllm import LLM, SamplingParams

def time_generation(llm: LLM, prompts: list[str],
-                    sampling_params: SamplingParams):
+                    sampling_params: SamplingParams, title: str):
    # Generate texts from the prompts. The output is a list of RequestOutput
    # objects that contain the prompt, generated text, and other information.
    # Warmup first
@@ -16,11 +23,15 @@ def time_generation(llm: LLM, prompts: list[str],
    start = time.time()
    outputs = llm.generate(prompts, sampling_params)
    end = time.time()
-    print((end - start) / sum([len(o.outputs[0].token_ids) for o in outputs]))
+    print("-" * 50)
+    print(title)
+    print("time: ",
+          (end - start) / sum(len(o.outputs[0].token_ids) for o in outputs))
    # Print the outputs.
    for output in outputs:
        generated_text = output.outputs[0].text
        print(f"text: {generated_text!r}")
+    print("-" * 50)


if __name__ == "__main__":
@@ -41,8 +52,7 @@ def time_generation(llm: LLM, prompts: list[str],
    # Create an LLM without spec decoding
    llm = LLM(model="meta-llama/Llama-2-13b-chat-hf")

-    print("Without speculation")
-    time_generation(llm, prompts, sampling_params)
+    time_generation(llm, prompts, sampling_params, "Without speculation")

    del llm
    gc.collect()
@@ -55,5 +65,4 @@ def time_generation(llm: LLM, prompts: list[str],
        },
    )

-    print("With speculation")
-    time_generation(llm, prompts, sampling_params)
+    time_generation(llm, prompts, sampling_params, "With speculation")
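For context, the last hunk only shows the closing lines of the speculative-decoding `LLM` constructor. Below is a minimal sketch of how that second model is typically built in this example; the `speculative_config` dict and the draft-model name are assumptions for illustration, not lines taken from this diff.

```python
# Illustrative sketch only -- not part of the diff above.
# Assumes vllm.LLM accepts a speculative_config dict; the draft-model name
# is a placeholder and should be replaced with the one the example actually uses.
llm = LLM(
    model="meta-llama/Llama-2-13b-chat-hf",
    speculative_config={
        "model": "ibm-ai-platform/llama-13b-accelerator",  # hypothetical draft model
    },
)

time_generation(llm, prompts, sampling_params, "With speculation")
```

Note that the printed `time:` value is the elapsed wall-clock time divided by the total number of generated tokens, i.e. seconds per output token, so a lower value in the "With speculation" run indicates a speedup from speculative decoding.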