huggingface · gante · Jan 19, 2024 · Jan 15, 2024 · Jan 15, 2024 · Jan 15, 2024
diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py
@@ -4807,7 +4807,7 @@ def _speculative_sampling(
     # (= keep with p = probability_ratio). Keep all the tokens until the first rejection
     r_i = torch.rand_like(probability_ratio)
     is_accepted = r_i <= probability_ratio
-    n_matches = (~is_accepted.cumsum(dim=-1) < 1).sum()  # this is `n` in algorithm 1
+    n_matches = ((~is_accepted).cumsum(dim=-1) < 1).sum()  # this is `n` in algorithm 1
 
     # Ensure we don't generate beyond max_len or an EOS token (not in algorithm 1, but needed for correct behavior)
     if last_assistant_token_is_eos and n_matches == candidate_length:
@@ -4819,14 +4819,23 @@ def _speculative_sampling(
     p_n_plus_1 = p[:, n_matches, :]
     if n_matches < gamma:
         q_n_plus_1 = q[:, n_matches, :]
-        p_prime = torch.clamp((p_n_plus_1 - q_n_plus_1), min=0).softmax(dim=-1)
+        p_prime = torch.clamp((p_n_plus_1 - q_n_plus_1), min=0)
+        p_prime.div_(p_prime.sum())
     else:
         p_prime = p_n_plus_1
     t = torch.multinomial(p_prime, num_samples=1).squeeze(1)[None, :]
 
     # The selected tokens include the matches (if any) plus the next sampled tokens
     if n_matches > 0:
-        valid_tokens = torch.cat((candidate_input_ids[:, -n_matches:], t), dim=-1)
+        valid_tokens = torch.cat(
+            (
+                candidate_input_ids[
+                    :, -candidate_length : candidate_input_ids.size(1) - candidate_length + n_matches :
+                ],
+                t,
+            ),
+            dim=-1,
+        )
     else:
         valid_tokens = t