diff --git a/tests/entrypoints/llm/test_generate.py b/tests/entrypoints/llm/test_generate.py
index 4676dc992a87..3c3281c34d56 100644
--- a/tests/entrypoints/llm/test_generate.py
+++ b/tests/entrypoints/llm/test_generate.py
@@ -25,6 +25,12 @@
 ]
 
 
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    """We can run both engines for this test."""
+    pass
+
+
 @pytest.fixture(scope="module")
 def llm():
     # pytest caches the fixture so we use weakref.proxy to
@@ -104,3 +110,19 @@ def test_multiple_sampling_params(llm: LLM):
     # sampling_params is None, default params should be applied
     outputs = llm.generate(PROMPTS, sampling_params=None)
     assert len(PROMPTS) == len(outputs)
+
+
+def test_max_model_len():
+    max_model_len = 20
+    llm = LLM(
+        model=MODEL_NAME,
+        max_model_len=max_model_len,
+        gpu_memory_utilization=0.10,
+        enforce_eager=True,  # reduce test time
+    )
+    sampling_params = SamplingParams(max_tokens=max_model_len + 10)
+    outputs = llm.generate(PROMPTS, sampling_params)
+    for output in outputs:
+        num_total_tokens = len(output.prompt_token_ids) + len(
+            output.outputs[0].token_ids)
+        assert num_total_tokens == max_model_len
diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py
index 7925d91f6064..3fb2f71b5e99 100644
--- a/vllm/engine/output_processor/stop_checker.py
+++ b/vllm/engine/output_processor/stop_checker.py
@@ -82,7 +82,7 @@ def maybe_stop_sequence(
             return
 
         # Check if the sequence has reached max_model_len.
-        if seq.get_len() > self._get_max_model_len(lora_req):
+        if seq.get_len() >= self._get_max_model_len(lora_req):
             seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
             return