
Commit c30a6b7

kashif authored and Cyrilvallez committed
[paged-attention] fix off-by-1 error in paged attention generation (#39258)

* fix off-by-1 error in paged attention generation
* formatting
* use update_with_token
1 parent be7d1a9 commit c30a6b7

File tree

1 file changed (+5 −1)

src/transformers/generation/continuous_batching.py

Lines changed: 5 additions & 1 deletion
@@ -122,6 +122,11 @@ def update_with_token(self, token_id: int) -> bool:
         is_eos = token_id == self.eos_token_id and self.eos_token_id != -1
         is_max_len = self.generated_len() >= self.max_new_tokens
 
+        # Only add the token if we're not finishing due to max length
+        # (EOS tokens should still be added to the output)
+        if not (is_max_len and not is_eos):
+            self.static_outputs.extend([token_id])
+
         if is_eos or is_max_len:
             self.status = RequestStatus.FINISHED
             return True

@@ -1011,7 +1016,6 @@ def update_batch(self):
                 self.metrics.record_ttft_metric(state.created_time, state.request_id)
                 state.status = RequestStatus.DECODING
                 token = out_tokens[self.logits_indices[i]]
-                state.static_outputs.extend([token])
                 state.prompt_ids = [token]
                 if state.update_with_token(token):
                     self.metrics.record_request_completion(state.created_time, state.request_id)
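
As a reading aid, below is a minimal, self-contained sketch of the behavior this patch establishes: the sampled token is now appended inside update_with_token, and the append is skipped when the request is finishing purely because it hit max_new_tokens (an EOS token is still recorded). The RequestState and RequestStatus classes, and the assumption that generated_len() counts the tokens already held in static_outputs, are simplified stand-ins for illustration, not the actual continuous_batching implementation.

# Hypothetical, simplified stand-ins for the real continuous_batching classes;
# field names and generated_len() semantics are assumptions for illustration.
from enum import Enum


class RequestStatus(Enum):
    DECODING = "decoding"
    FINISHED = "finished"


class RequestState:
    def __init__(self, eos_token_id: int, max_new_tokens: int):
        self.eos_token_id = eos_token_id
        self.max_new_tokens = max_new_tokens
        self.static_outputs: list[int] = []  # tokens generated so far
        self.status = RequestStatus.DECODING

    def generated_len(self) -> int:
        # Assumed to equal the number of tokens already emitted.
        return len(self.static_outputs)

    def update_with_token(self, token_id: int) -> bool:
        """Record one sampled token; return True when the request is finished."""
        is_eos = token_id == self.eos_token_id and self.eos_token_id != -1
        is_max_len = self.generated_len() >= self.max_new_tokens

        # Only add the token if we're not finishing due to max length
        # (EOS tokens should still be added to the output).
        if not (is_max_len and not is_eos):
            self.static_outputs.extend([token_id])

        if is_eos or is_max_len:
            self.status = RequestStatus.FINISHED
            return True
        return False


# Under these assumed semantics, a request capped at max_new_tokens=3 ends up
# with exactly 3 output tokens: the fourth call marks it finished without
# appending, which is the off-by-one the guard is meant to avoid.
state = RequestState(eos_token_id=2, max_new_tokens=3)
for tok in [10, 11, 12, 13]:
    if state.update_with_token(tok):
        break
assert state.static_outputs == [10, 11, 12]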
