fix modeling cohere (huggingface#14)
younesbelkada committed Mar 15, 2024
1 parent aeb5908 commit bb7f728
Showing 1 changed file with 26 additions and 21 deletions.
47 changes: 26 additions & 21 deletions src/transformers/models/cohere/modeling_cohere.py
@@ -551,7 +551,7 @@ def forward(
         past_key_value = getattr(self, "past_key_value", past_key_value)

         if past_key_value is not None:
-            # sin and cos are specific to RoPE models; position_ids needed for the static cache
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

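For context on the hunk above: the comment now names `cache_position`, which is what `cache_kwargs` actually forwards to the cache update; a static, pre-allocated cache uses these absolute slot indices to write new key/value states into its fixed-size buffers. A minimal self-contained sketch of that update pattern, using a toy cache class and made-up shapes rather than the transformers `StaticCache` API:

import torch

class ToyStaticCache:
    """Illustrative stand-in for a pre-allocated KV cache; not the transformers implementation."""

    def __init__(self, batch, heads, max_len, head_dim):
        # Fixed-size buffers of shape (batch, heads, max_len, head_dim)
        self.k = torch.zeros(batch, heads, max_len, head_dim)
        self.v = torch.zeros(batch, heads, max_len, head_dim)

    def update(self, key_states, value_states, layer_idx, cache_kwargs):
        # Write the new states at their absolute positions; layer_idx is ignored in this toy.
        cache_position = cache_kwargs["cache_position"]
        self.k[:, :, cache_position] = key_states
        self.v[:, :, cache_position] = value_states
        return self.k, self.v

cache = ToyStaticCache(batch=1, heads=2, max_len=8, head_dim=4)
new_k = torch.randn(1, 2, 3, 4)  # 3 prompt tokens
new_v = torch.randn(1, 2, 3, 4)
k, v = cache.update(new_k, new_v, layer_idx=0, cache_kwargs={"cache_position": torch.arange(3)})
print(k.shape)  # torch.Size([1, 2, 8, 4]) -- full buffer, first 3 slots now filled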
@@ -1143,14 +1143,26 @@ def forward(
         )

     def prepare_inputs_for_generation(
-        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, cache_position=None, **kwargs
     ):
+        # With static cache, the `past_key_values` is None
+        # TODO joao: standardize interface for the different Cache classes and remove of this if
+        has_static_cache = False
+        if past_key_values is None:
+            past_key_values = getattr(self.model.layers[0].self_attn, "past_key_value", None)
+            has_static_cache = past_key_values is not None
+
+        past_length = 0
         if past_key_values is not None:
             if isinstance(past_key_values, Cache):
-                cache_length = past_key_values.get_seq_length()
-                past_length = past_key_values.seen_tokens
-                max_cache_length = past_key_values.get_max_length()
+                past_length = cache_position[0] if cache_position is not None else past_key_values.get_seq_length()
+                max_cache_length = (
+                    torch.tensor(past_key_values.get_max_length(), device=input_ids.device)
+                    if past_key_values.get_max_length() is not None
+                    else None
+                )
+                cache_length = past_length if max_cache_length is None else torch.min(max_cache_length, past_length)
             # TODO joao: remove this `else` after `generate` prioritizes `Cache` objects
             else:
                 cache_length = past_length = past_key_values[0][0].shape[2]
                 max_cache_length = None
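Taken on its own, the rewritten branch above derives `past_length` from `cache_position[0]` when a `cache_position` is passed, and clamps `cache_length` to the cache's maximum size; `get_max_length()` is wrapped in a tensor on `input_ids.device`, presumably so the `torch.min` comparison runs on the same device. A standalone sketch of the arithmetic with invented values:

import torch

# Hypothetical decode step: 5 tokens already in a static cache capped at 16 slots.
cache_position = torch.tensor([5])        # position of the token fed on this step
max_len = 16                              # pretend get_max_length() returned 16

past_length = cache_position[0]           # tensor(5), taken from cache_position rather than seen_tokens
max_cache_length = torch.tensor(max_len)  # tensor(16)
cache_length = past_length if max_cache_length is None else torch.min(max_cache_length, past_length)
print(int(past_length), int(cache_length))  # 5 5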
@@ -1183,22 +1195,6 @@ def prepare_inputs_for_generation(
             if past_key_values:
                 position_ids = position_ids[:, -input_ids.shape[1] :]

-        if self.generation_config.cache_implementation == "static":
-            # generation with static cache
-            cache_position = kwargs.get("cache_position", None)
-            if cache_position is None:
-                past_length = 0
-            else:
-                past_length = cache_position[-1] + 1
-            input_ids = input_ids[:, past_length:]
-            position_ids = position_ids[:, past_length:]
-
-        # TODO @gante we should only keep a `cache_position` in generate, and do +=1.
-        # same goes for position ids. Could also help with continued generation.
-        input_length = position_ids.shape[-1] if position_ids is not None else input_ids.shape[-1]
-        cache_position = torch.arange(past_length, past_length + input_length, device=input_ids.device)
-        position_ids = position_ids.contiguous() if position_ids is not None else None
-
         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
         if inputs_embeds is not None and past_key_values is None:
             model_inputs = {"inputs_embeds": inputs_embeds}
@@ -1208,6 +1204,15 @@ def prepare_inputs_for_generation(
             # TODO: use `next_tokens` directly instead.
             model_inputs = {"input_ids": input_ids.contiguous()}

+        input_length = position_ids.shape[-1] if position_ids is not None else input_ids.shape[-1]
+        if cache_position is None:
+            cache_position = torch.arange(past_length, past_length + input_length, device=input_ids.device)
+        else:
+            cache_position = cache_position[-input_length:]
+
+        if has_static_cache:
+            past_key_values = None
+
         model_inputs.update(
             {
                 "position_ids": position_ids,
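Putting the prepare_inputs_for_generation hunks together: the old static-cache special case is removed, and `cache_position` is now either taken from the caller or rebuilt with `torch.arange` from `past_length`, then trimmed to the current input length on subsequent steps. A standalone sketch of that logic (the helper name and example values are invented for illustration):

import torch

def derive_cache_position(past_length, input_length, cache_position=None, device="cpu"):
    # First step: positions past_length .. past_length + input_length - 1.
    if cache_position is None:
        return torch.arange(past_length, past_length + input_length, device=device)
    # Later steps: keep only the positions matching the tokens fed this step.
    return cache_position[-input_length:]

# Prefill of a 4-token prompt, then a single-token decode step.
print(derive_cache_position(past_length=0, input_length=4))                                    # tensor([0, 1, 2, 3])
print(derive_cache_position(past_length=4, input_length=1, cache_position=torch.tensor([4])))  # tensor([4])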
