
Commit c30a6b7

kashif authored and Cyrilvallez committed
[paged-attention] fix off-by-1 error in paged attention generation (#39258)

* fix off-by-1 error in paged attention generation
* formatting
* use update_with_token
1 parent be7d1a9 commit c30a6b7

File tree

1 file changed (+5 −1)

src/transformers/generation/continuous_batching.py

Lines changed: 5 additions & 1 deletion
@@ -122,6 +122,11 @@ def update_with_token(self, token_id: int) -> bool:
         is_eos = token_id == self.eos_token_id and self.eos_token_id != -1
         is_max_len = self.generated_len() >= self.max_new_tokens
 
+        # Only add the token if we're not finishing due to max length
+        # (EOS tokens should still be added to the output)
+        if not (is_max_len and not is_eos):
+            self.static_outputs.extend([token_id])
+
         if is_eos or is_max_len:
             self.status = RequestStatus.FINISHED
             return True

@@ -1011,7 +1016,6 @@ def update_batch(self):
                 self.metrics.record_ttft_metric(state.created_time, state.request_id)
                 state.status = RequestStatus.DECODING
                 token = out_tokens[self.logits_indices[i]]
-                state.static_outputs.extend([token])
                 state.prompt_ids = [token]
                 if state.update_with_token(token):
                     self.metrics.record_request_completion(state.created_time, state.request_id)
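
As a reading aid, below is a minimal, self-contained sketch of the behavior this patch establishes: the sampled token is now appended inside update_with_token, and the append is skipped when the request is finishing purely because it hit max_new_tokens (an EOS token is still recorded). The RequestState and RequestStatus classes, and the assumption that generated_len() counts the tokens already held in static_outputs, are simplified stand-ins for illustration, not the actual continuous_batching implementation.

# Hypothetical, simplified stand-ins for the real continuous_batching classes;
# field names and generated_len() semantics are assumptions for illustration.
from enum import Enum


class RequestStatus(Enum):
    DECODING = "decoding"
    FINISHED = "finished"


class RequestState:
    def __init__(self, eos_token_id: int, max_new_tokens: int):
        self.eos_token_id = eos_token_id
        self.max_new_tokens = max_new_tokens
        self.static_outputs: list[int] = []  # tokens generated so far
        self.status = RequestStatus.DECODING

    def generated_len(self) -> int:
        # Assumed to equal the number of tokens already emitted.
        return len(self.static_outputs)

    def update_with_token(self, token_id: int) -> bool:
        """Record one sampled token; return True when the request is finished."""
        is_eos = token_id == self.eos_token_id and self.eos_token_id != -1
        is_max_len = self.generated_len() >= self.max_new_tokens

        # Only add the token if we're not finishing due to max length
        # (EOS tokens should still be added to the output).
        if not (is_max_len and not is_eos):
            self.static_outputs.extend([token_id])

        if is_eos or is_max_len:
            self.status = RequestStatus.FINISHED
            return True
        return False


# Under these assumed semantics, a request capped at max_new_tokens=3 ends up
# with exactly 3 output tokens: the fourth call marks it finished without
# appending, which is the off-by-one the guard is meant to avoid.
state = RequestState(eos_token_id=2, max_new_tokens=3)
for tok in [10, 11, 12, 13]:
    if state.update_with_token(tok):
        break
assert state.static_outputs == [10, 11, 12]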
