
Commit 31a4b3e

Revert #24446 and #26168 (#26332)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
1 parent caf8b1c commit 31a4b3e

5 files changed: +10 −117 lines changed

tests/entrypoints/llm/test_generate.py

Lines changed: 2 additions & 3 deletions

@@ -85,11 +85,10 @@ def test_max_model_len():
         num_total_tokens = len(output.prompt_token_ids) + len(
             output.outputs[0].token_ids
         )
-        # Total tokens must not exceed max_model_len + 1 (the last token can be
-        # generated with the context length equal to the max model length)
+        # Total tokens must not exceed max_model_len.
         # It can be less if generation finishes due to other reasons (e.g., EOS)
         # before reaching the absolute model length limit.
-        assert num_total_tokens <= max_model_len + 1
+        assert num_total_tokens <= max_model_len


 def test_log_stats():
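For context, a minimal sketch of the invariant the restored assertion enforces: after this revert, prompt tokens plus generated tokens should never exceed max_model_len, even when max_tokens would allow more. The model name, prompt, and parameter values below are illustrative assumptions, not taken from the test file.

```python
# Minimal sketch (not the test file itself); model name and values are assumptions.
from vllm import LLM, SamplingParams

max_model_len = 64
llm = LLM(model="facebook/opt-125m", max_model_len=max_model_len)

# Ask for more tokens than the context window allows; ignore_eos forces
# generation to run until the length cap stops it.
params = SamplingParams(max_tokens=max_model_len + 10, ignore_eos=True)
outputs = llm.generate(["Hello, my name is"], params)

for output in outputs:
    num_total_tokens = len(output.prompt_token_ids) + len(output.outputs[0].token_ids)
    # With the reverted behaviour, the sequence is capped at max_model_len (not +1).
    assert num_total_tokens <= max_model_len
```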

tests/v1/e2e/test_context_length.py

Lines changed: 0 additions & 90 deletions
This file was deleted.

vllm/v1/core/sched/scheduler.py

Lines changed: 1 addition & 1 deletion

@@ -223,7 +223,7 @@ def schedule(self) -> SchedulerOutput:
             # Make sure the input position does not exceed the max model len.
             # This is necessary when using spec decoding.
             num_new_tokens = min(
-                num_new_tokens, self.max_model_len - request.num_computed_tokens
+                num_new_tokens, self.max_model_len - 1 - request.num_computed_tokens
             )

             # Schedule encoder inputs.
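To illustrate the clamp restored here, a standalone sketch with made-up numbers (not the scheduler code itself): with the "- 1", input positions stay within the first max_model_len - 1 slots, so the token sampled for the last scheduled position brings the sequence to at most max_model_len tokens.

```python
# Standalone sketch of the restored clamp; variable names mirror the scheduler
# code but the values here are made up for illustration.
max_model_len = 16
num_computed_tokens = 12   # tokens already processed for this request
num_new_tokens = 8         # e.g. proposed decode + speculative draft tokens

# Restored behaviour: cap scheduled tokens so that
# num_computed_tokens + num_new_tokens <= max_model_len - 1; the token sampled
# at the last scheduled position then makes the sequence at most max_model_len
# long, which is exactly when check_stop (>=) finishes the request.
num_new_tokens = min(num_new_tokens, max_model_len - 1 - num_computed_tokens)
print(num_new_tokens)  # 3: only 3 of the 8 proposed tokens are scheduled
```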

vllm/v1/core/sched/utils.py

Lines changed: 1 addition & 1 deletion

@@ -44,7 +44,7 @@ def check_stop(
     request: Request, max_model_len: int, pooler_output: Optional[torch.Tensor] = None
 ) -> bool:
     if (
-        request.num_tokens > max_model_len
+        request.num_tokens >= max_model_len
         or request.num_output_tokens >= request.max_tokens
     ):
         request.status = RequestStatus.FINISHED_LENGTH_CAPPED
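A hedged sketch of how the two restored conditions interact, as a simplified standalone function rather than the vLLM implementation (the real check_stop also handles pooling requests and other stop criteria): a request is length-capped once it holds max_model_len tokens in total or has produced max_tokens outputs.

```python
# Simplified standalone version of the stop check, for illustration only.
def is_length_capped(num_tokens: int, num_output_tokens: int,
                     max_model_len: int, max_tokens: int) -> bool:
    # Restored condition: stop as soon as the total sequence length reaches
    # max_model_len (>=), rather than only after exceeding it (>).
    return num_tokens >= max_model_len or num_output_tokens >= max_tokens

# With max_model_len = 16, a request holding exactly 16 tokens is now finished:
assert is_length_capped(num_tokens=16, num_output_tokens=4,
                        max_model_len=16, max_tokens=100)
```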

vllm/v1/worker/gpu_model_runner.py

Lines changed: 6 additions & 22 deletions

@@ -2317,30 +2317,14 @@ def _bookkeeping_sync(

             start_idx = self.input_batch.num_tokens_no_spec[req_idx]
             end_idx = start_idx + len(sampled_ids)
-            assert end_idx <= self.max_model_len + 1, (
-                "Sampled token IDs exceed the max model length + 1. "
-                f"Total number of tokens: {end_idx} > max_model_len + 1: "
-                f"{self.max_model_len + 1}"
+            assert end_idx <= self.max_model_len, (
+                "Sampled token IDs exceed the max model length. "
+                f"Total number of tokens: {end_idx} > max_model_len: "
+                f"{self.max_model_len}"
             )

-            n_tokens_cache = len(sampled_ids)
-
-            # Sampled token IDs exceed the max model length by 1. This is
-            # legitimate as we can still sample 1 last token when the context
-            # length equals the max model length. Note that we do not need to
-            # cache this token ID as the sequence finishes after this step.
-            # Additionally, the buffers token_ids_cpu and is_token_ids are of
-            # size max model length only.
-            if end_idx == self.max_model_len + 1:
-                n_tokens_cache -= 1
-
-            self.input_batch.token_ids_cpu[
-                req_idx, start_idx : (start_idx + n_tokens_cache)
-            ] = sampled_ids[:n_tokens_cache]
-            self.input_batch.is_token_ids[
-                req_idx, start_idx : (start_idx + n_tokens_cache)
-            ] = True
-
+            self.input_batch.token_ids_cpu[req_idx, start_idx:end_idx] = sampled_ids
+            self.input_batch.is_token_ids[req_idx, start_idx:end_idx] = True
             self.input_batch.num_tokens_no_spec[req_idx] = end_idx
             self.input_batch.num_tokens[req_idx] = end_idx
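A small sketch of what the restored bookkeeping does, using NumPy stand-ins for the input-batch buffers rather than the GpuModelRunner class (the buffer names mirror the runner code, but the setup is made up): since end_idx can no longer exceed max_model_len, every sampled token fits in the fixed-size per-request buffers and can be written back with a single slice assignment, with no special case for an extra token.

```python
# Illustrative sketch with NumPy stand-ins for the input-batch buffers.
import numpy as np

max_model_len = 16
max_num_reqs = 4
token_ids_cpu = np.zeros((max_num_reqs, max_model_len), dtype=np.int64)
is_token_ids = np.zeros((max_num_reqs, max_model_len), dtype=bool)
num_tokens_no_spec = np.array([10, 0, 0, 0])

req_idx = 0
sampled_ids = [101, 102, 103]  # tokens accepted for this request in this step

start_idx = num_tokens_no_spec[req_idx]
end_idx = start_idx + len(sampled_ids)
# With the revert, end_idx is guaranteed to be <= max_model_len, so the
# sampled tokens always fit inside the (max_num_reqs, max_model_len) buffers.
assert end_idx <= max_model_len

token_ids_cpu[req_idx, start_idx:end_idx] = sampled_ids
is_token_ids[req_idx, start_idx:end_idx] = True
num_tokens_no_spec[req_idx] = end_idx
```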
