2 files changed: +23 −1 lines

@@ -25,6 +25,12 @@
 ]


+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    """We can run both engines for this test."""
+    pass
+
+
 @pytest.fixture(scope="module")
 def llm():
     # pytest caches the fixture so we use weakref.proxy to
@@ -104,3 +110,19 @@ def test_multiple_sampling_params(llm: LLM):
     # sampling_params is None, default params should be applied
     outputs = llm.generate(PROMPTS, sampling_params=None)
     assert len(PROMPTS) == len(outputs)
+
+
+def test_max_model_len():
+    max_model_len = 20
+    llm = LLM(
+        model=MODEL_NAME,
+        max_model_len=max_model_len,
+        gpu_memory_utilization=0.10,
+        enforce_eager=True,  # reduce test time
+    )
+    sampling_params = SamplingParams(max_tokens=max_model_len + 10)
+    outputs = llm.generate(PROMPTS, sampling_params)
+    for output in outputs:
+        num_total_tokens = len(output.prompt_token_ids) + len(
+            output.outputs[0].token_ids)
+        assert num_total_tokens == max_model_len
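The autouse v1 fixture added at the top of this file requests run_with_both_engines, which comes from the test suite's shared conftest, so the tests in this module run under both engines without referencing the fixture explicitly. As a rough, hypothetical sketch (not the actual conftest code), such a fixture could be implemented by parametrizing over an environment flag such as VLLM_USE_V1:

import pytest

# Hypothetical sketch of a both-engines fixture; the real run_with_both_engines
# in vLLM's conftest may be implemented differently.
@pytest.fixture(params=[False, True], ids=["engine_v0", "engine_v1"])
def run_with_both_engines(request, monkeypatch):
    # Run the requesting test twice, flipping the (assumed) VLLM_USE_V1
    # environment variable so each engine implementation gets exercised.
    monkeypatch.setenv("VLLM_USE_V1", "1" if request.param else "0")
    yield

Because v1 is autouse=True and depends on run_with_both_engines, plain tests such as test_max_model_len pick up both parametrizations automatically.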
vllm/engine/output_processor:

@@ -82,7 +82,7 @@ def maybe_stop_sequence(
             return

         # Check if the sequence has reached max_model_len.
-        if seq.get_len() > self._get_max_model_len(lora_req):
+        if seq.get_len() >= self._get_max_model_len(lora_req):
             seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
             return

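The one-character change above is what the new test_max_model_len exercises: with ">=", a sequence is marked FINISHED_LENGTH_CAPPED as soon as prompt plus generated tokens reach max_model_len, so the total never exceeds the limit, whereas the old ">" only tripped once the limit had already been passed by one token. A minimal standalone sketch of that rule, using a hypothetical should_cap helper rather than vLLM's actual StopChecker API:

def should_cap(prompt_len: int, num_generated: int, max_model_len: int) -> bool:
    """Return True once prompt + generated tokens reach max_model_len.

    Hypothetical helper for illustration only; it mirrors the ">=" check
    in the diff, which caps the total at exactly max_model_len, while the
    old ">" allowed the total to reach max_model_len + 1.
    """
    return prompt_len + num_generated >= max_model_len


# With max_model_len = 20 and a 5-token prompt, generation is capped at the
# 15th new token, so the total is exactly 20, matching the assertion in
# test_max_model_len above.
assert should_cap(prompt_len=5, num_generated=15, max_model_len=20)
assert not should_cap(prompt_len=5, num_generated=14, max_model_len=20)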